From 70186a9b21e32bec04a2236b21c24249300602e2 Mon Sep 17 00:00:00 2001
From: john bowen
Date: Thu, 5 Sep 2024 09:38:36 -0700
Subject: [PATCH 1/9] Add CMake configuration for style target

---
 .clang-format          | 34 +++++++++++++++++----
 CMakeLists.txt         |  5 +++-
 cmake/RAJAMacros.cmake | 67 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 100 insertions(+), 6 deletions(-)

diff --git a/.clang-format b/.clang-format
index 1d2ad9a77f..47b6b0bee6 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,15 +1,41 @@
-BasedOnStyle : google
+BasedOnStyle : LLVM
+# Indent formatting
 IndentWidth : 2
+UseTab: Never
 BreakBeforeBraces : Linux
 KeepEmptyLinesAtTheStartOfBlocks : true
 MaxEmptyLinesToKeep : 2
 AccessModifierOffset : -2
-UseTab: Never
+
+# Control curly brace placement
+BraceWrapping:
+  AfterCaseLabel: true
+  AfterClass: true
+  AfterControlStatement: true
+  AfterEnum: true
+  AfterFunction: true
+  AfterNamespace: true
+  AfterObjCDeclaration: false
+  AfterStruct: true
+  AfterUnion: true
+  AfterExternBlock: false
+  BeforeCatch: true
+  BeforeElse: true
+  # BeforeLambdaBody: true # available in clang 11
+  IndentBraces: false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+
+# Pointer alignment
+DerivePointerAlignment: false
+PointerAlignment: Left
+SortIncludes: false
 AllowShortIfStatementsOnASingleLine : true
 ConstructorInitializerAllOnOneLineOrOnePerLine : true
 AllowShortFunctionsOnASingleLine : true
 AllowShortLoopsOnASingleLine : false
-BinPackParameters : false
+BinPackParameters : true
 AllowAllParametersOfDeclarationOnNextLine : false
 AlignTrailingComments : true
 ColumnLimit : 80
@@ -17,11 +43,9 @@ PenaltyBreakBeforeFirstCallParameter : 100
 PenaltyReturnTypeOnItsOwnLine : 65000
 PenaltyBreakString : 10
 
-# These improve formatting results but require clang 3.6/7 or higher
 BreakBeforeBinaryOperators : None
 AlignAfterOpenBracket: true
 BinPackArguments : false
 AlignOperands : true
 AlwaysBreakTemplateDeclarations : true
-Cpp11BracedListStyle : true
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b31cbe124..dbe5b3f113 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,7 +41,7 @@ project(RAJA LANGUAGES CXX C VERSION ${RAJA_LOADED})
 
 set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/thirdparty" ${CMAKE_MODULE_PATH})
 
-
+set(BLT_REQUIRED_CLANGFORMAT_VERSION "14" CACHE STRING "")
 include(cmake/SetupRajaOptions.cmake)
 
 cmake_minimum_required(VERSION 3.23)
@@ -136,6 +136,9 @@ include(cmake/SetupCompilers.cmake)
 # Macros for building executables and libraries
 include (cmake/RAJAMacros.cmake)
 
+# Configure `style` target for enforcing code style
+raja_add_code_checks()
+
 set (raja_sources
   src/AlignedRangeIndexSetBuilders.cpp
   src/DepGraphNode.cpp
diff --git a/cmake/RAJAMacros.cmake b/cmake/RAJAMacros.cmake
index c412593db7..5233850919 100644
--- a/cmake/RAJAMacros.cmake
+++ b/cmake/RAJAMacros.cmake
@@ -204,3 +204,70 @@ macro(raja_add_benchmark)
     NUM_OMP_THREADS ${arg_NUM_OMP_THREADS}
     COMMAND ${TEST_DRIVER} ${arg_NAME})
 endmacro(raja_add_benchmark)
+
+##------------------------------------------------------------------------------
+## raja_add_code_checks()
+##
+## Adds code checks for all source files recursively in the RAJA repository.
+##
+## This creates the following parent build targets:
+##   check - Runs a non file changing style check and CppCheck
+##   style - In-place code formatting
+##
+## Creates various child build targets that follow this pattern:
+##         raja_
+##         raja__
+##------------------------------------------------------------------------------
+macro(raja_add_code_checks)
+
+  set(options)
+  set(singleValueArgs)
+  set(multiValueArgs)
+
+  # Parse the arguments to the macro
+  cmake_parse_arguments(arg
+      "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  # Only do code checks if building raja by itself and not included in
+  # another project
+  if ("${PROJECT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}")
+    # Create file globbing expressions that only include directories that contain source
+    set(_base_dirs "RAJA" "examples" "exercises" "benchmark" "include" "src" "test")
+    set(_ext_expressions "*.cpp" "*.hpp" "*.inl"
+                         "*.cxx" "*.hxx" "*.cc" "*.c" "*.h" "*.hh")
+
+    set(_glob_expressions)
+    foreach(_exp ${_ext_expressions})
+      foreach(_base_dir ${_base_dirs})
+        list(APPEND _glob_expressions "${PROJECT_SOURCE_DIR}/${_base_dir}/${_exp}")
+      endforeach()
+    endforeach()
+
+    # Glob for list of files to run code checks on
+    set(_sources)
+    file(GLOB_RECURSE _sources ${_glob_expressions})
+
+    # Filter out exclusions
+    #set(_exclude_expressions
+    #  "${PROJECT_SOURCE_DIR}/axom/sidre/examples/lulesh2/*"
+    #  "${PROJECT_SOURCE_DIR}/axom/slam/examples/lulesh2.0.3/*"
+    #  "${PROJECT_SOURCE_DIR}/axom/slam/examples/tinyHydro/*")
+    #foreach(_exp ${_exclude_expressions})
+    #  list(FILTER _sources EXCLUDE REGEX ${_exp})
+    #endforeach()
+#
+    blt_add_code_checks(PREFIX RAJA
+                        SOURCES ${_sources}
+                        CLANGFORMAT_CFG_FILE ${PROJECT_SOURCE_DIR}/.clang-format
+                        CPPCHECK_FLAGS --enable=all --inconclusive)
+
+    # Set FOLDER property for code check targets
+    foreach(_suffix clangformat_check clangformat_style clang_tidy_check clang_tidy_style)
+      set(_tgt ${arg_PREFIX}_${_suffix})
+      if(TARGET ${_tgt})
+        set_target_properties(${_tgt} PROPERTIES FOLDER "RAJA/code_checks")
+      endif()
+    endforeach()
+  endif()
+
+endmacro(raja_add_code_checks)

From 696caf4f122e412bec0eff74a2c770f8c9bcbc50 Mon Sep 17 00:00:00 2001
From: john bowen
Date: Thu, 5 Sep 2024 10:13:33 -0700
Subject: [PATCH 2/9] Refactor RAJA using the make style target

---
 .clang-format | 17 +-
 examples/dynamic-forall.cpp | 93 +-
 examples/dynamic_mat_transpose.cpp | 311 +-
 examples/forall-param-reductions.cpp | 389 +-
 examples/forall_multi-reductions.cpp | 114 +-
 examples/jacobi.cpp | 317 +-
 examples/kernel-dynamic-tile.cpp | 50 +-
 examples/launch-param-reductions.cpp | 338 +-
 examples/launch_flatten.cpp | 107 +-
 examples/launch_matrix-multiply.cpp | 765 ++--
 examples/launch_reductions.cpp | 159 +-
 examples/memoryManager.hpp | 57 +-
 examples/multiview.cpp | 168 +-
 examples/omp-target-kernel.cpp | 32 +-
 examples/omp-target-ltimes.cpp | 135 +-
 examples/pi-reduce_vs_atomic.cpp | 130 +-
 examples/plugin/counter-plugin.cpp | 46 +-
 examples/plugin/test-plugin-dynamic.cpp | 9 +-
 examples/plugin/test-plugin.cpp | 10 +-
 examples/plugin/timer-plugin.cpp | 20 +-
 examples/raja-launch.cpp | 96 +-
 examples/red-black-gauss-seidel.cpp | 95 +-
 examples/resource-dynamic-forall.cpp | 117 +-
 examples/resource-forall.cpp | 380 +-
 examples/resource-kernel.cpp | 54 +-
 examples/resource-launch.cpp | 54 +-
 examples/resource-runtime-launch.cpp | 159 +-
 examples/tut_daxpy.cpp | 225 +-
 examples/tut_halo-exchange.cpp | 2172 +++++++-----
 examples/tut_launch_basic.cpp | 220 +-
 examples/tut_matrix-multiply.cpp |
1438 ++++---- examples/wave-eqn.cpp | 190 +- exercises/atomic-histogram.cpp | 151 +- exercises/atomic-histogram_solution.cpp | 193 +- exercises/dot-product.cpp | 101 +- exercises/dot-product_solution.cpp | 95 +- .../kernel-matrix-transpose-local-array.cpp | 477 +-- ...-matrix-transpose-local-array_solution.cpp | 726 ++-- exercises/kernel-matrix-transpose-tiled.cpp | 214 +- ...kernel-matrix-transpose-tiled_solution.cpp | 282 +- exercises/kernel-matrix-transpose.cpp | 85 +- .../kernel-matrix-transpose_solution.cpp | 133 +- exercises/kernelintro-execpols.cpp | 461 ++- exercises/kernelintro-execpols_solution.cpp | 512 ++- exercises/kernelintro-nested-loop-reorder.cpp | 159 +- ...rnelintro-nested-loop-reorder_solution.cpp | 209 +- .../launch-matrix-transpose-local-array.cpp | 288 +- ...-matrix-transpose-local-array_solution.cpp | 328 +- exercises/launch-matrix-transpose-tiled.cpp | 230 +- ...launch-matrix-transpose-tiled_solution.cpp | 235 +- exercises/launch-matrix-transpose.cpp | 138 +- .../launch-matrix-transpose_solution.cpp | 113 +- exercises/launchintro-execpols.cpp | 410 +-- exercises/launchintro-execpols_solution.cpp | 412 +-- exercises/memoryManager.hpp | 57 +- exercises/offset-layout-stencil.cpp | 263 +- exercises/offset-layout-stencil_solution.cpp | 295 +- .../permuted-layout-batch-matrix-multiply.cpp | 666 ++-- ...-layout-batch-matrix-multiply_solution.cpp | 749 ++-- exercises/reductions.cpp | 126 +- exercises/reductions_solution.cpp | 148 +- exercises/scan.cpp | 130 +- exercises/scan_solution.cpp | 125 +- exercises/segment-indexset-basics.cpp | 157 +- .../segment-indexset-basics_solution.cpp | 180 +- exercises/sort.cpp | 395 ++- exercises/sort_solution.cpp | 384 +- exercises/tutorial_halfday/ex2_approx-pi.cpp | 101 +- .../ex2_approx-pi_solution.cpp | 111 +- .../tutorial_halfday/ex5_line-of-sight.cpp | 133 +- .../ex5_line-of-sight_solution.cpp | 146 +- .../ex6_stencil-offset-layout.cpp | 257 +- .../ex6_stencil-offset-layout_solution.cpp | 248 +- .../ex8_tiled-matrix-transpose.cpp | 120 +- .../ex8_tiled-matrix-transpose_solution.cpp | 229 +- .../ex9_matrix-transpose-local-array.cpp | 88 +- ..._matrix-transpose-local-array_solution.cpp | 321 +- exercises/tutorial_halfday/memoryManager.hpp | 44 +- exercises/vector-addition.cpp | 204 +- exercises/vector-addition_solution.cpp | 230 +- exercises/vertexsum-indexset.cpp | 432 +-- exercises/vertexsum-indexset_solution.cpp | 448 +-- exercises/view-layout.cpp | 481 +-- exercises/view-layout_solution.cpp | 488 +-- include/RAJA/RAJA.hpp | 12 +- include/RAJA/index/IndexSet.hpp | 289 +- include/RAJA/index/IndexSetBuilders.hpp | 37 +- include/RAJA/index/IndexSetUtils.hpp | 43 +- include/RAJA/index/IndexValue.hpp | 144 +- include/RAJA/index/ListSegment.hpp | 75 +- include/RAJA/index/RangeSegment.hpp | 153 +- include/RAJA/internal/DepGraphNode.hpp | 13 +- include/RAJA/internal/Iterators.hpp | 129 +- include/RAJA/internal/MemUtils_CPU.hpp | 24 +- include/RAJA/internal/RAJAVec.hpp | 196 +- include/RAJA/internal/ThreadUtils_CPU.hpp | 4 +- include/RAJA/internal/fault_tolerance.hpp | 114 +- include/RAJA/internal/foldl.hpp | 47 +- include/RAJA/internal/get_platform.hpp | 50 +- include/RAJA/pattern/WorkGroup.hpp | 227 +- include/RAJA/pattern/WorkGroup/Dispatcher.hpp | 472 ++- include/RAJA/pattern/WorkGroup/WorkRunner.hpp | 176 +- .../RAJA/pattern/WorkGroup/WorkStorage.hpp | 552 +-- include/RAJA/pattern/WorkGroup/WorkStruct.hpp | 62 +- include/RAJA/pattern/atomic.hpp | 63 +- include/RAJA/pattern/detail/algorithm.hpp | 27 +- 
include/RAJA/pattern/detail/forall.hpp | 12 +- include/RAJA/pattern/detail/multi_reduce.hpp | 177 +- include/RAJA/pattern/detail/privatizer.hpp | 15 +- include/RAJA/pattern/detail/reduce.hpp | 248 +- include/RAJA/pattern/forall.hpp | 501 +-- include/RAJA/pattern/kernel.hpp | 99 +- include/RAJA/pattern/kernel/Collapse.hpp | 8 +- include/RAJA/pattern/kernel/Conditional.hpp | 68 +- include/RAJA/pattern/kernel/For.hpp | 43 +- include/RAJA/pattern/kernel/ForICount.hpp | 35 +- include/RAJA/pattern/kernel/Hyperplane.hpp | 43 +- include/RAJA/pattern/kernel/InitLocalMem.hpp | 68 +- include/RAJA/pattern/kernel/Lambda.hpp | 188 +- include/RAJA/pattern/kernel/Param.hpp | 15 +- include/RAJA/pattern/kernel/Reduce.hpp | 10 +- include/RAJA/pattern/kernel/Region.hpp | 35 +- include/RAJA/pattern/kernel/Tile.hpp | 110 +- include/RAJA/pattern/kernel/TileTCount.hpp | 40 +- .../RAJA/pattern/kernel/internal/LoopData.hpp | 123 +- .../pattern/kernel/internal/LoopTypes.hpp | 78 +- .../pattern/kernel/internal/Statement.hpp | 17 +- .../pattern/kernel/internal/StatementList.hpp | 31 +- .../RAJA/pattern/kernel/internal/Template.hpp | 19 +- include/RAJA/pattern/launch/launch_core.hpp | 591 ++-- include/RAJA/pattern/multi_reduce.hpp | 9 +- include/RAJA/pattern/params/forall.hpp | 726 ++-- include/RAJA/pattern/params/kernel_name.hpp | 17 +- include/RAJA/pattern/params/params_base.hpp | 21 +- include/RAJA/pattern/params/reducer.hpp | 128 +- include/RAJA/pattern/reduce.hpp | 6 +- include/RAJA/pattern/region.hpp | 4 +- include/RAJA/pattern/scan.hpp | 268 +- include/RAJA/pattern/sort.hpp | 214 +- include/RAJA/pattern/synchronize.hpp | 4 +- .../RAJA/pattern/tensor/MatrixRegister.hpp | 36 +- .../RAJA/pattern/tensor/ScalarRegister.hpp | 10 +- include/RAJA/pattern/tensor/TensorBlock.hpp | 1 - include/RAJA/pattern/tensor/TensorIndex.hpp | 361 +- include/RAJA/pattern/tensor/TensorLayout.hpp | 83 +- .../RAJA/pattern/tensor/TensorRegister.hpp | 131 +- .../RAJA/pattern/tensor/VectorRegister.hpp | 13 +- .../tensor/internal/ET/BinaryOperator.hpp | 233 +- .../internal/ET/BinaryOperatorTraits.hpp | 226 +- .../tensor/internal/ET/BlockLiteral.hpp | 144 +- .../internal/ET/ExpressionTemplateBase.hpp | 231 +- .../tensor/internal/ET/MultiplyOperator.hpp | 2223 ++++++------ .../tensor/internal/ET/TensorDivide.hpp | 681 ++-- .../tensor/internal/ET/TensorLiteral.hpp | 130 +- .../tensor/internal/ET/TensorLoadStore.hpp | 361 +- .../tensor/internal/ET/TensorMultiply.hpp | 248 +- .../tensor/internal/ET/TensorMultiplyAdd.hpp | 145 +- .../tensor/internal/ET/TensorNegate.hpp | 103 +- .../internal/ET/TensorScalarLiteral.hpp | 121 +- .../tensor/internal/ET/TensorTranspose.hpp | 114 +- .../tensor/internal/ET/normalizeOperand.hpp | 99 +- .../tensor/internal/ExpressionTemplate.hpp | 1 - .../tensor/internal/MatrixMatrixMultiply.hpp | 523 +-- .../tensor/internal/MatrixRegisterImpl.hpp | 2686 +++++++------- .../pattern/tensor/internal/RegisterBase.hpp | 2023 +++++------ .../tensor/internal/TensorIndexTraits.hpp | 582 ++- .../pattern/tensor/internal/TensorRef.hpp | 1256 ++++--- .../tensor/internal/TensorRegisterBase.hpp | 1539 ++++---- .../tensor/internal/TensorTileExec.hpp | 531 +-- .../tensor/internal/VectorRegisterImpl.hpp | 1768 +++++----- include/RAJA/pattern/tensor/stats.hpp | 3 +- include/RAJA/policy/MultiPolicy.hpp | 70 +- include/RAJA/policy/PolicyBase.hpp | 148 +- include/RAJA/policy/WorkGroup.hpp | 87 +- include/RAJA/policy/atomic_auto.hpp | 54 +- include/RAJA/policy/atomic_builtin.hpp | 352 +- include/RAJA/policy/cuda.hpp | 6 +- 
include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 253 +- include/RAJA/policy/cuda/WorkGroup.hpp | 2 +- .../RAJA/policy/cuda/WorkGroup/Dispatcher.hpp | 47 +- .../RAJA/policy/cuda/WorkGroup/WorkRunner.hpp | 275 +- include/RAJA/policy/cuda/atomic.hpp | 459 +-- include/RAJA/policy/cuda/forall.hpp | 717 ++-- include/RAJA/policy/cuda/intrinsics.hpp | 157 +- include/RAJA/policy/cuda/kernel.hpp | 2 +- .../RAJA/policy/cuda/kernel/Conditional.hpp | 20 +- .../RAJA/policy/cuda/kernel/CudaKernel.hpp | 249 +- include/RAJA/policy/cuda/kernel/For.hpp | 216 +- include/RAJA/policy/cuda/kernel/ForICount.hpp | 318 +- .../RAJA/policy/cuda/kernel/Hyperplane.hpp | 42 +- .../RAJA/policy/cuda/kernel/InitLocalMem.hpp | 164 +- include/RAJA/policy/cuda/kernel/Lambda.hpp | 34 +- include/RAJA/policy/cuda/kernel/Reduce.hpp | 40 +- include/RAJA/policy/cuda/kernel/Sync.hpp | 53 +- include/RAJA/policy/cuda/kernel/Tile.hpp | 151 +- .../RAJA/policy/cuda/kernel/TileTCount.hpp | 198 +- include/RAJA/policy/cuda/kernel/internal.hpp | 515 +-- include/RAJA/policy/cuda/launch.hpp | 991 ++++-- include/RAJA/policy/cuda/multi_reduce.hpp | 552 +-- .../RAJA/policy/cuda/params/kernel_name.hpp | 56 +- include/RAJA/policy/cuda/params/reduce.hpp | 83 +- include/RAJA/policy/cuda/policy.hpp | 3126 ++++++++++------- include/RAJA/policy/cuda/raja_cudaerrchk.hpp | 35 +- include/RAJA/policy/cuda/reduce.hpp | 557 +-- include/RAJA/policy/cuda/scan.hpp | 108 +- include/RAJA/policy/cuda/sort.hpp | 555 +-- include/RAJA/policy/cuda/synchronize.hpp | 10 +- include/RAJA/policy/desul.hpp | 2 +- include/RAJA/policy/desul/atomic.hpp | 141 +- include/RAJA/policy/hip.hpp | 4 +- include/RAJA/policy/hip/MemUtils_HIP.hpp | 279 +- include/RAJA/policy/hip/WorkGroup.hpp | 2 +- .../RAJA/policy/hip/WorkGroup/Dispatcher.hpp | 40 +- .../RAJA/policy/hip/WorkGroup/WorkRunner.hpp | 303 +- include/RAJA/policy/hip/atomic.hpp | 455 +-- include/RAJA/policy/hip/forall.hpp | 700 ++-- include/RAJA/policy/hip/intrinsics.hpp | 148 +- include/RAJA/policy/hip/kernel.hpp | 2 +- .../RAJA/policy/hip/kernel/Conditional.hpp | 22 +- include/RAJA/policy/hip/kernel/For.hpp | 221 +- include/RAJA/policy/hip/kernel/ForICount.hpp | 320 +- include/RAJA/policy/hip/kernel/HipKernel.hpp | 228 +- include/RAJA/policy/hip/kernel/Hyperplane.hpp | 42 +- .../RAJA/policy/hip/kernel/InitLocalMem.hpp | 166 +- include/RAJA/policy/hip/kernel/Lambda.hpp | 34 +- include/RAJA/policy/hip/kernel/Reduce.hpp | 59 +- include/RAJA/policy/hip/kernel/Sync.hpp | 49 +- include/RAJA/policy/hip/kernel/Tile.hpp | 151 +- include/RAJA/policy/hip/kernel/TileTCount.hpp | 192 +- include/RAJA/policy/hip/kernel/internal.hpp | 515 +-- include/RAJA/policy/hip/launch.hpp | 968 +++-- include/RAJA/policy/hip/multi_reduce.hpp | 551 +-- .../RAJA/policy/hip/params/kernel_name.hpp | 55 +- include/RAJA/policy/hip/params/reduce.hpp | 82 +- include/RAJA/policy/hip/policy.hpp | 2885 +++++++++------ include/RAJA/policy/hip/raja_hiperrchk.hpp | 32 +- include/RAJA/policy/hip/reduce.hpp | 551 +-- include/RAJA/policy/hip/scan.hpp | 110 +- include/RAJA/policy/hip/sort.hpp | 465 ++- include/RAJA/policy/hip/synchronize.hpp | 10 +- include/RAJA/policy/openmp.hpp | 6 +- include/RAJA/policy/openmp/WorkGroup.hpp | 2 +- .../policy/openmp/WorkGroup/Dispatcher.hpp | 12 +- .../policy/openmp/WorkGroup/WorkRunner.hpp | 70 +- include/RAJA/policy/openmp/atomic.hpp | 77 +- include/RAJA/policy/openmp/forall.hpp | 435 +-- include/RAJA/policy/openmp/kernel.hpp | 2 +- .../RAJA/policy/openmp/kernel/Collapse.hpp | 53 +- .../policy/openmp/kernel/OmpSyncThreads.hpp | 33 +- 
include/RAJA/policy/openmp/launch.hpp | 385 +- include/RAJA/policy/openmp/multi_reduce.hpp | 279 +- include/RAJA/policy/openmp/params/forall.hpp | 562 +-- .../RAJA/policy/openmp/params/kernel_name.hpp | 50 +- include/RAJA/policy/openmp/params/reduce.hpp | 52 +- include/RAJA/policy/openmp/policy.hpp | 237 +- include/RAJA/policy/openmp/reduce.hpp | 19 +- include/RAJA/policy/openmp/region.hpp | 20 +- include/RAJA/policy/openmp/scan.hpp | 121 +- include/RAJA/policy/openmp/sort.hpp | 147 +- include/RAJA/policy/openmp/synchronize.hpp | 8 +- include/RAJA/policy/openmp_target.hpp | 5 +- .../RAJA/policy/openmp_target/WorkGroup.hpp | 2 +- .../openmp_target/WorkGroup/Dispatcher.hpp | 30 +- .../openmp_target/WorkGroup/WorkRunner.hpp | 70 +- include/RAJA/policy/openmp_target/forall.hpp | 115 +- include/RAJA/policy/openmp_target/kernel.hpp | 2 +- .../policy/openmp_target/kernel/Collapse.hpp | 96 +- .../RAJA/policy/openmp_target/kernel/For.hpp | 44 +- .../openmp_target/params/kernel_name.hpp | 50 +- .../policy/openmp_target/params/reduce.hpp | 52 +- include/RAJA/policy/openmp_target/policy.hpp | 76 +- include/RAJA/policy/openmp_target/reduce.hpp | 190 +- include/RAJA/policy/sequential.hpp | 4 +- include/RAJA/policy/sequential/WorkGroup.hpp | 2 +- .../sequential/WorkGroup/Dispatcher.hpp | 14 +- .../sequential/WorkGroup/WorkRunner.hpp | 68 +- include/RAJA/policy/sequential/atomic.hpp | 49 +- include/RAJA/policy/sequential/forall.hpp | 56 +- include/RAJA/policy/sequential/kernel.hpp | 2 +- .../policy/sequential/kernel/Collapse.hpp | 30 +- .../RAJA/policy/sequential/kernel/Reduce.hpp | 10 +- include/RAJA/policy/sequential/launch.hpp | 195 +- .../RAJA/policy/sequential/multi_reduce.hpp | 88 +- .../policy/sequential/params/kernel_name.hpp | 54 +- .../RAJA/policy/sequential/params/reduce.hpp | 48 +- include/RAJA/policy/sequential/policy.hpp | 64 +- include/RAJA/policy/sequential/reduce.hpp | 6 +- include/RAJA/policy/sequential/region.hpp | 10 +- include/RAJA/policy/sequential/scan.hpp | 90 +- include/RAJA/policy/sequential/sort.hpp | 83 +- include/RAJA/policy/simd.hpp | 2 +- include/RAJA/policy/simd/forall.hpp | 46 +- include/RAJA/policy/simd/kernel/For.hpp | 34 +- include/RAJA/policy/simd/kernel/ForICount.hpp | 26 +- include/RAJA/policy/simd/launch.hpp | 30 +- include/RAJA/policy/simd/policy.hpp | 10 +- include/RAJA/policy/sycl.hpp | 4 +- include/RAJA/policy/sycl/MemUtils_SYCL.hpp | 23 +- include/RAJA/policy/sycl/forall.hpp | 264 +- include/RAJA/policy/sycl/kernel.hpp | 2 +- .../RAJA/policy/sycl/kernel/Conditional.hpp | 21 +- include/RAJA/policy/sycl/kernel/For.hpp | 199 +- include/RAJA/policy/sycl/kernel/ForICount.hpp | 256 +- include/RAJA/policy/sycl/kernel/Lambda.hpp | 34 +- .../RAJA/policy/sycl/kernel/SyclKernel.hpp | 106 +- include/RAJA/policy/sycl/kernel/Tile.hpp | 192 +- .../RAJA/policy/sycl/kernel/TileTCount.hpp | 241 +- include/RAJA/policy/sycl/kernel/internal.hpp | 151 +- include/RAJA/policy/sycl/launch.hpp | 994 +++--- .../RAJA/policy/sycl/params/kernel_name.hpp | 61 +- include/RAJA/policy/sycl/params/reduce.hpp | 53 +- include/RAJA/policy/sycl/policy.hpp | 155 +- include/RAJA/policy/sycl/reduce.hpp | 300 +- include/RAJA/policy/tensor.hpp | 2 +- include/RAJA/policy/tensor/arch.hpp | 54 +- include/RAJA/policy/tensor/arch/avx.hpp | 10 +- .../policy/tensor/arch/avx/avx_double.hpp | 889 ++--- .../RAJA/policy/tensor/arch/avx/avx_float.hpp | 928 ++--- .../RAJA/policy/tensor/arch/avx/avx_int32.hpp | 1489 ++++---- .../RAJA/policy/tensor/arch/avx/avx_int64.hpp | 1019 +++--- 
.../RAJA/policy/tensor/arch/avx/traits.hpp | 79 +- include/RAJA/policy/tensor/arch/avx2.hpp | 10 +- .../policy/tensor/arch/avx2/avx2_double.hpp | 1012 +++--- .../policy/tensor/arch/avx2/avx2_float.hpp | 971 ++--- .../policy/tensor/arch/avx2/avx2_int32.hpp | 1109 +++--- .../policy/tensor/arch/avx2/avx2_int64.hpp | 1037 +++--- .../RAJA/policy/tensor/arch/avx2/traits.hpp | 91 +- include/RAJA/policy/tensor/arch/avx512.hpp | 10 +- .../tensor/arch/avx512/avx512_double.hpp | 708 ++-- .../tensor/arch/avx512/avx512_float.hpp | 731 ++-- .../tensor/arch/avx512/avx512_int32.hpp | 861 ++--- .../tensor/arch/avx512/avx512_int64.hpp | 751 ++-- .../RAJA/policy/tensor/arch/avx512/traits.hpp | 80 +- include/RAJA/policy/tensor/arch/cuda.hpp | 4 +- .../policy/tensor/arch/cuda/cuda_warp.hpp | 1960 ++++++----- .../RAJA/policy/tensor/arch/cuda/traits.hpp | 31 +- include/RAJA/policy/tensor/arch/hip.hpp | 4 +- .../RAJA/policy/tensor/arch/hip/hip_wave.hpp | 1956 ++++++----- .../RAJA/policy/tensor/arch/hip/traits.hpp | 31 +- include/RAJA/policy/tensor/arch/scalar.hpp | 8 +- .../RAJA/policy/tensor/arch/scalar/scalar.hpp | 895 ++--- .../RAJA/policy/tensor/arch/scalar/traits.hpp | 85 +- include/RAJA/policy/tensor/arch_impl.hpp | 14 +- include/RAJA/policy/tensor/policy.hpp | 35 +- include/RAJA/util/BitMask.hpp | 101 +- include/RAJA/util/CombiningAdapter.hpp | 86 +- include/RAJA/util/EnableIf.hpp | 12 +- include/RAJA/util/IndexLayout.hpp | 143 +- include/RAJA/util/KokkosPluginLoader.hpp | 57 +- include/RAJA/util/Layout.hpp | 146 +- include/RAJA/util/LocalArray.hpp | 104 +- include/RAJA/util/OffsetLayout.hpp | 123 +- include/RAJA/util/OffsetOperators.hpp | 51 +- include/RAJA/util/Operators.hpp | 266 +- include/RAJA/util/Permutations.hpp | 43 +- include/RAJA/util/PermutedLayout.hpp | 16 +- include/RAJA/util/PluginContext.hpp | 28 +- include/RAJA/util/PluginLinker.hpp | 22 +- include/RAJA/util/PluginOptions.hpp | 14 +- include/RAJA/util/PluginStrategy.hpp | 28 +- include/RAJA/util/Registry.hpp | 239 +- include/RAJA/util/RepeatView.hpp | 117 +- include/RAJA/util/RuntimePluginLoader.hpp | 45 +- include/RAJA/util/SoAArray.hpp | 6 +- include/RAJA/util/SoAPtr.hpp | 75 +- include/RAJA/util/Span.hpp | 60 +- include/RAJA/util/StaticLayout.hpp | 162 +- include/RAJA/util/Timer.hpp | 15 +- include/RAJA/util/TypeConvert.hpp | 8 +- include/RAJA/util/TypedViewBase.hpp | 1278 +++---- include/RAJA/util/View.hpp | 217 +- include/RAJA/util/align.hpp | 18 +- include/RAJA/util/basic_mempool.hpp | 108 +- include/RAJA/util/camp_aliases.hpp | 2 +- include/RAJA/util/concepts.hpp | 10 +- include/RAJA/util/for_each.hpp | 40 +- include/RAJA/util/macros.hpp | 44 +- include/RAJA/util/math.hpp | 45 +- include/RAJA/util/mutex.hpp | 8 +- include/RAJA/util/plugins.hpp | 75 +- include/RAJA/util/reduce.hpp | 187 +- include/RAJA/util/resource.hpp | 295 +- include/RAJA/util/sort.hpp | 492 +-- include/RAJA/util/sycl_compat.hpp | 2 +- include/RAJA/util/types.hpp | 146 +- include/RAJA/util/zip.hpp | 104 +- include/RAJA/util/zip_tuple.hpp | 392 ++- src/AlignedRangeIndexSetBuilders.cpp | 122 +- src/DepGraphNode.cpp | 8 +- src/KokkosPluginLoader.cpp | 73 +- src/LockFreeIndexSetBuilders.cpp | 124 +- src/MemUtils_CUDA.cpp | 8 +- src/MemUtils_HIP.cpp | 8 +- src/MemUtils_SYCL.cpp | 8 +- src/PluginStrategy.cpp | 22 +- src/RuntimePluginLoader.cpp | 61 +- src/TensorStats.cpp | 13 +- ...t-dynamic-forall-resource-RangeSegment.hpp | 91 +- .../test-dynamic-forall-RangeSegment.hpp | 110 +- .../tests/test-forall-CombiningAdapter-1D.hpp | 99 +- 
.../tests/test-forall-CombiningAdapter-2D.hpp | 134 +- .../tests/test-forall-CombiningAdapter-3D.hpp | 222 +- .../tests/test-forall-atomic-basic.hpp | 101 +- .../tests/test-forall-AtomicRefAdd.hpp | 188 +- .../tests/test-forall-AtomicRefCAS.hpp | 187 +- .../tests/test-forall-AtomicRefLoadStore.hpp | 174 +- .../tests/test-forall-AtomicRefLogical.hpp | 271 +- .../tests/test-forall-AtomicRefMinMax.hpp | 173 +- .../tests/test-forall-AtomicRefSub.hpp | 173 +- .../tests/test-forall-AtomicMultiView.hpp | 74 +- ...test-forall-AtomicOutOfBoundsMultiView.hpp | 62 +- .../tests/test-forall-AtomicView.hpp | 52 +- .../tests/test-forall-IcountIndexSetView.hpp | 62 +- .../tests/test-forall-IndexSetView.hpp | 60 +- .../tests/test-forall-IcountIndexSet.hpp | 62 +- .../indexset/tests/test-forall-IndexSet.hpp | 55 +- .../tests/test-forall-basic-MultiReduce.hpp | 231 +- .../tests/test-forall-basic-ReduceBitAnd.hpp | 129 +- .../tests/test-forall-basic-ReduceBitOr.hpp | 134 +- .../tests/test-forall-basic-ReduceMax.hpp | 132 +- .../tests/test-forall-basic-ReduceMaxLoc.hpp | 146 +- .../tests/test-forall-basic-ReduceMin.hpp | 134 +- .../tests/test-forall-basic-ReduceMinLoc.hpp | 144 +- .../tests/test-forall-basic-ReduceSum.hpp | 132 +- .../test-forall-basic-expt-ReduceBitAnd.hpp | 147 +- .../test-forall-basic-expt-ReduceBitOr.hpp | 160 +- .../test-forall-basic-expt-ReduceMax.hpp | 161 +- .../test-forall-basic-expt-ReduceMaxLoc.hpp | 163 +- .../test-forall-basic-expt-ReduceMin.hpp | 159 +- .../test-forall-basic-expt-ReduceMinLoc.hpp | 161 +- .../test-forall-basic-expt-ReduceSum.hpp | 153 +- ...est-forall-indexset-multiple-ReduceMax.hpp | 78 +- ...-forall-indexset-multiple-ReduceMaxLoc.hpp | 95 +- ...est-forall-indexset-multiple-ReduceMin.hpp | 90 +- ...-forall-indexset-multiple-ReduceMinLoc.hpp | 95 +- ...est-forall-indexset-multiple-ReduceSum.hpp | 84 +- ...test-forall-segment-multiple-ReduceMax.hpp | 75 +- ...t-forall-segment-multiple-ReduceMaxLoc.hpp | 87 +- ...test-forall-segment-multiple-ReduceMin.hpp | 73 +- ...t-forall-segment-multiple-ReduceMinLoc.hpp | 86 +- ...test-forall-segment-multiple-ReduceSum.hpp | 111 +- .../region/tests/test-forall-region.hpp | 51 +- .../test-forall-ResourceIcountIndexSet.hpp | 64 +- .../tests/test-forall-ResourceIndexSet.hpp | 62 +- .../test-forall-resource-ListSegment.hpp | 74 +- .../test-forall-resource-RangeSegment.hpp | 71 +- ...est-forall-resource-RangeStrideSegment.hpp | 186 +- .../tests/test-forall-ListSegmentView.hpp | 143 +- .../tests/test-forall-RangeSegment2DView.hpp | 84 +- .../tests/test-forall-RangeSegmentView.hpp | 103 +- .../test-forall-RangeStrideSegmentView.hpp | 144 +- .../segment/tests/test-forall-ListSegment.hpp | 86 +- .../tests/test-forall-RangeSegment.hpp | 87 +- .../tests/test-forall-RangeStrideSegment.hpp | 183 +- .../indexset-build/test-aligned-indexset.cpp | 9 +- .../tests/basic-fission-fusion-loop-impl.hpp | 25 +- ...nel-basic-fission-fusion-loop-segments.hpp | 21 +- .../tests/basic-single-icount-loop-impl.hpp | 121 +- ...rnel-basic-single-icount-loop-segments.hpp | 92 +- .../tests/basic-single-loop-segments-impl.hpp | 116 +- ...test-kernel-basic-single-loop-segments.hpp | 115 +- ...el-resource-basic-single-loop-segments.hpp | 115 +- .../conditional-fission-fusion-loop-impl.hpp | 28 +- ...nditional-fission-fusion-loop-segments.hpp | 39 +- .../tests/test-kernel-hyperplane-2D.hpp | 151 +- .../tests/test-kernel-hyperplane-3D.hpp | 192 +- .../tests/test-kernel-nested-MultiReduce.hpp | 367 +- .../tests/nested-loop-BlockReduceSum-impl.hpp | 180 +- 
.../tests/nested-loop-ReduceSum-impl.hpp | 329 +- ...test-kernel-nested-loop-BlockReduceSum.hpp | 17 +- .../test-kernel-nested-loop-ReduceSum.hpp | 17 +- ...el-resource-nested-loop-BlockReduceSum.hpp | 17 +- ...-kernel-resource-nested-loop-ReduceSum.hpp | 17 +- ...test-kernel-nested-loops-segment-types.hpp | 247 +- .../test-kernel-nested-loop-OffsetView2D.hpp | 98 +- .../test-kernel-nested-loop-OffsetView3D.hpp | 111 +- ...ernel-nested-loop-PermutedOffsetView2D.hpp | 109 +- ...ernel-nested-loop-PermutedOffsetView3D.hpp | 142 +- ...test-kernel-nested-loop-PermutedView2D.hpp | 73 +- ...test-kernel-nested-loop-PermutedView3D.hpp | 90 +- .../tests/nested-loop-Basic-impl.hpp | 347 +- .../tests/nested-loop-MultiLambda-impl.hpp | 253 +- .../nested-loop-MultiLambdaParam-impl.hpp | 226 +- .../tests/test-kernel-nested-loop-Basic.hpp | 16 +- .../test-kernel-nested-loop-MultiLambda.hpp | 11 +- ...st-kernel-nested-loop-MultiLambdaParam.hpp | 12 +- ...test-kernel-resource-nested-loop-Basic.hpp | 16 +- ...ernel-resource-nested-loop-MultiLambda.hpp | 11 +- ...-resource-nested-loop-MultiLambdaParam.hpp | 12 +- .../tests/test-kernel-reduceloc-Max2D.hpp | 122 +- .../tests/test-kernel-reduceloc-Max2DView.hpp | 122 +- .../test-kernel-reduceloc-Max2DViewTuple.hpp | 121 +- .../tests/test-kernel-reduceloc-Min2D.hpp | 122 +- .../tests/test-kernel-reduceloc-Min2DView.hpp | 122 +- .../test-kernel-reduceloc-Min2DViewTuple.hpp | 121 +- .../region/tests/test-kernel-region-data.hpp | 10 +- .../region/tests/test-kernel-region-sync.hpp | 56 +- .../region/tests/test-kernel-region.hpp | 55 +- .../test-kernel-single-loop-ForICount.hpp | 45 +- .../test-kernel-single-loop-TileTCount.hpp | 45 +- .../tests/test-kernel-tile-Dynamic2D.hpp | 149 +- .../tests/test-kernel-tile-Fixed2D.hpp | 117 +- .../tests/test-kernel-tile-Fixed2DMinMax.hpp | 84 +- .../tests/test-kernel-tile-Fixed2DSum.hpp | 61 +- .../tests/test-kernel-tile-LocalArray2D.hpp | 152 +- ...kernel-resource-warp-thread-ReduceMask.hpp | 14 +- ...kernel-resource-warp-thread-ReduceWarp.hpp | 14 +- ...t-kernel-resource-warp-thread-WarpLoop.hpp | 14 +- .../test-kernel-warp-thread-ReduceMask.hpp | 14 +- .../test-kernel-warp-thread-ReduceWarp.hpp | 14 +- .../test-kernel-warp-thread-WarpLoop.hpp | 14 +- .../tests/warp-thread-ReduceMask-impl.hpp | 219 +- .../tests/warp-thread-ReduceWarp-impl.hpp | 329 +- .../tests/warp-thread-WarpLoop-impl.hpp | 234 +- .../tests/test-launch-nested-MultiReduce.hpp | 359 +- .../tests/test-launch-nested-Direct.hpp | 237 +- .../tests/test-launch-nested-Loop.hpp | 234 +- .../tests/test-launch-nested-Tile-Direct.hpp | 256 +- .../tests/test-launch-nested-Tile-Loop.hpp | 253 +- .../tests/test-launch-basic-ReduceBitAnd.hpp | 173 +- .../tests/test-launch-basic-ReduceMin.hpp | 176 +- .../tests/test-launch-basic-ReduceSum.hpp | 158 +- ...t-launch-basic-param-expt-ReduceBitAnd.hpp | 205 +- ...test-launch-basic-param-expt-ReduceMin.hpp | 206 +- ...test-launch-basic-param-expt-ReduceSum.hpp | 189 +- .../tests/test-launch-BasicShared.hpp | 107 +- .../segment/tests/test-launch-ListSegment.hpp | 134 +- .../tests/test-launch-RangeSegment.hpp | 179 +- .../tests/test-launch-RangeStrideSegment.hpp | 251 +- .../tests/test-launch-DynamicMem.hpp | 144 +- .../tests/test-launch-StaticMem.hpp | 136 +- .../test-launch-nested-Tile-iCount-Direct.hpp | 188 +- .../test-launch-nested-Tile-iCount-Loop.hpp | 191 +- .../scan/tests/test-scan-Exclusive.hpp | 77 +- .../scan/tests/test-scan-ExclusiveInplace.hpp | 71 +- .../scan/tests/test-scan-Inclusive.hpp | 63 +- 
.../scan/tests/test-scan-InclusiveInplace.hpp | 52 +- test/functional/scan/tests/test-scan-data.hpp | 16 +- .../matrix/test-tensor-matrix-double.hpp | 135 +- .../matrix/test-tensor-matrix-float.hpp | 73 +- .../matrix/test-tensor-matrix-int32_t.hpp | 72 +- .../matrix/test-tensor-matrix-int64_t.hpp | 144 +- .../tests/test-tensor-matrix-CtorGetSet.hpp | 59 +- .../tests/test-tensor-matrix-ET_Add.hpp | 143 +- .../tests/test-tensor-matrix-ET_Divide.hpp | 144 +- .../tests/test-tensor-matrix-ET_LoadStore.hpp | 182 +- ...-tensor-matrix-ET_MatrixMatrixMultiply.hpp | 189 +- ...nsor-matrix-ET_MatrixMatrixMultiplyAdd.hpp | 202 +- .../test-tensor-matrix-ET_MatrixVector.hpp | 168 +- .../tests/test-tensor-matrix-ET_Negate.hpp | 116 +- .../tests/test-tensor-matrix-ET_Subtract.hpp | 143 +- .../tests/test-tensor-matrix-ET_Transpose.hpp | 137 +- .../test-tensor-matrix-Load_ColMajor.hpp | 126 +- .../test-tensor-matrix-Load_RowMajor.hpp | 127 +- .../test-tensor-matrix-Store_ColMajor.hpp | 140 +- .../test-tensor-matrix-Store_RowMajor.hpp | 140 +- .../tests/test-tensor-matrix-Transpose.hpp | 59 +- .../tests/test-tensor-register-Add.hpp | 55 +- .../tests/test-tensor-register-Divide.hpp | 76 +- .../tests/test-tensor-register-DotProduct.hpp | 32 +- .../tests/test-tensor-register-FMA.hpp | 44 +- .../tests/test-tensor-register-FMS.hpp | 44 +- .../tests/test-tensor-register-Gather.hpp | 52 +- .../tests/test-tensor-register-GetSet.hpp | 121 +- .../tests/test-tensor-register-Load.hpp | 89 +- .../tests/test-tensor-register-Max.hpp | 51 +- .../tests/test-tensor-register-Min.hpp | 49 +- .../tests/test-tensor-register-Multiply.hpp | 55 +- .../tests/test-tensor-register-Scatter.hpp | 68 +- ...ensor-register-SegmentedBroadcastInner.hpp | 65 +- ...ensor-register-SegmentedBroadcastOuter.hpp | 52 +- ...st-tensor-register-SegmentedDotProduct.hpp | 48 +- ...test-tensor-register-SegmentedSumInner.hpp | 42 +- ...test-tensor-register-SegmentedSumOuter.hpp | 41 +- .../tests/test-tensor-register-Store.hpp | 101 +- .../tests/test-tensor-register-Subtract.hpp | 55 +- .../tests/test-tensor-vector-CtorGetSet.hpp | 46 +- .../tests/test-tensor-vector-FmaFms.hpp | 50 +- .../test-tensor-vector-ForallVectorRef1d.hpp | 89 +- .../test-tensor-vector-ForallVectorRef2d.hpp | 115 +- .../tests/test-tensor-vector-MinMax.hpp | 24 +- .../tests/test-tensor-vector-SumDot.hpp | 24 +- .../util/test-CombiningAdapter-1D.cpp | 21 +- .../util/test-CombiningAdapter-2D.cpp | 47 +- .../util/test-CombiningAdapter-3D.cpp | 74 +- .../util/test-PermutedCombiningAdapter-1D.cpp | 21 +- .../util/test-PermutedCombiningAdapter-2D.cpp | 44 +- .../util/test-PermutedCombiningAdapter-3D.cpp | 74 +- .../test-workgroup-Ordered-MultipleReuse.hpp | 629 ++-- .../tests/test-workgroup-Ordered-Single.hpp | 274 +- ...test-workgroup-Unordered-MultipleReuse.hpp | 577 +-- .../tests/test-workgroup-Unordered-Single.hpp | 268 +- test/include/RAJA_gtest.hpp | 277 +- test/include/RAJA_test-abs.hpp | 27 +- test/include/RAJA_test-atomic-ref-types.hpp | 91 +- test/include/RAJA_test-atomic-types.hpp | 15 +- test/include/RAJA_test-atomicpol.hpp | 88 +- test/include/RAJA_test-base.hpp | 3 +- test/include/RAJA_test-dynamic-forall.hpp | 19 +- .../RAJA_test-forall-async-execpol.hpp | 15 +- test/include/RAJA_test-forall-data.hpp | 12 +- test/include/RAJA_test-forall-execpol.hpp | 174 +- .../RAJA_test-forall-indexset-execpol.hpp | 44 +- test/include/RAJA_test-index-types.hpp | 58 +- test/include/RAJA_test-indexset-build.hpp | 64 +- .../RAJA_test-kernel-nested-loop-types.hpp | 153 +- 
...launch-direct-teams-threads-1D-execpol.hpp | 74 +- ...launch-direct-teams-threads-3D-execpol.hpp | 131 +- test/include/RAJA_test-launch-execpol.hpp | 60 +- ...t-launch-loop-teams-threads-1D-execpol.hpp | 82 +- ...t-launch-loop-teams-threads-3D-execpol.hpp | 140 +- .../RAJA_test-launch-runtime-execpol.hpp | 149 +- .../RAJA_test-multi-reduce-abstractor.hpp | 217 +- test/include/RAJA_test-multi-reducepol.hpp | 29 +- test/include/RAJA_test-platform.hpp | 7 +- test/include/RAJA_test-plugin-kernelpol.hpp | 138 +- test/include/RAJA_test-plugin-launchpol.hpp | 14 +- .../RAJA_test-plugin-resource-launchpol.hpp | 14 +- test/include/RAJA_test-reduce-types.hpp | 13 +- test/include/RAJA_test-reduceloc-types.hpp | 11 +- test/include/RAJA_test-reducepol.hpp | 39 +- test/include/RAJA_test-tensor.hpp | 239 +- test/include/RAJA_test-workgroup.hpp | 292 +- test/include/RAJA_unit-test-for3d3d.hpp | 139 +- test/include/RAJA_unit-test-forone.hpp | 29 +- test/include/RAJA_unit-test-policy.hpp | 62 +- test/include/RAJA_unit-test-types.hpp | 37 +- test/include/type_helper.hpp | 45 +- .../using-with-cmake/using-with-cmake.cpp | 14 +- test/integration/plugin/plugin_to_test.cpp | 22 +- test/integration/plugin/tests/counter.hpp | 14 +- .../plugin/tests/test-plugin-forall.hpp | 157 +- .../plugin/tests/test-plugin-kernel.hpp | 42 +- .../plugin/tests/test-plugin-launch.hpp | 51 +- .../tests/test-plugin-resource-launch.hpp | 54 +- .../plugin/tests/test-plugin-workgroup.hpp | 327 +- test/integration/plugin/tests/test-plugin.hpp | 71 +- test/integration/plugin_for_test_dynamic.cpp | 10 +- test/integration/plugin_for_test_kokkos.cpp | 20 +- test/integration/test_plugin_dynamic.cpp | 2 +- test/integration/test_plugin_kokkos.cpp | 2 +- test/old-tests/unit/cpu/test-synchronize.cpp | 3 +- test/old-tests/unit/cuda/test-synchronize.cpp | 14 +- test/old-tests/unit/test-sharedmem.cpp | 1302 ++++--- test/old-tests/unit/test-simd.cpp | 88 +- .../test-algorithm-util-for_each.cpp | 60 +- .../tests/test-algorithm-reduce-utils.hpp | 299 +- .../tests/test-algorithm-sort-utils.hpp | 511 +-- .../algorithm/tests/test-algorithm-sort.hpp | 68 +- .../tests/test-algorithm-stable-sort.hpp | 68 +- .../tests/test-algorithm-util-reduce.hpp | 146 +- .../tests/test-algorithm-util-sort.hpp | 539 ++- test/unit/atomic/test-atomic-incdec.cpp | 143 +- .../unit/atomic/test-atomic-ref-accessors.cpp | 90 +- test/unit/atomic/test-atomic-ref-addsub.cpp | 125 +- test/unit/atomic/test-atomic-ref-bitwise.cpp | 172 +- .../atomic/test-atomic-ref-constructor.cpp | 100 +- .../unit/atomic/test-atomic-ref-exchanges.cpp | 198 +- test/unit/atomic/test-atomic-ref-minmax.cpp | 89 +- test/unit/atomic/test-atomic-ref.hpp | 102 +- test/unit/hip/test-synchronize.cpp | 22 +- test/unit/index/test-indexset.cpp | 37 +- test/unit/index/test-indexvalue.cpp | 7 +- test/unit/index/test-listsegment.cpp | 44 +- test/unit/index/test-rangesegment.cpp | 42 +- test/unit/index/test-rangestridesegment.cpp | 99 +- test/unit/indexing/test-indexing.hpp | 30 +- .../indexing/tests/test-indexing-global.hpp | 88 +- test/unit/internal/test-iterators.cpp | 22 +- test/unit/internal/test-rajavec.cpp | 4 +- .../unit/multi_reducer/test-multi-reducer.hpp | 34 +- .../tests/test-multi-reducer-constructors.hpp | 183 +- .../tests/test-multi-reducer-reset.hpp | 417 ++- .../test-reducer-constructors-cuda.cpp | 16 +- .../reducer/test-reducer-constructors-hip.cpp | 16 +- ...est-reducer-constructors-openmp-target.cpp | 10 +- .../test-reducer-constructors-openmp.cpp | 16 +- 
 .../reducer/test-reducer-constructors-seq.cpp | 17 +-
 test/unit/reducer/test-reducer-reset-cuda.cpp | 6 +-
 test/unit/reducer/test-reducer-reset-hip.cpp | 6 +-
 .../test-reducer-reset-openmp-target.cpp | 6 +-
 .../reducer/test-reducer-reset-openmp.cpp | 6 +-
 test/unit/reducer/test-reducer-reset-seq.cpp | 7 +-
 test/unit/reducer/test-reducer.hpp | 18 +-
 .../tests/test-reducer-constructors.hpp | 145 +-
 .../unit/reducer/tests/test-reducer-reset.hpp | 177 +-
 .../tests/test-resource-AsyncTime.hpp | 59 +-
 .../test-resource-BasicAsyncSemantics.hpp | 34 +-
 .../resource/tests/test-resource-Depends.hpp | 46 +-
 .../test-resource-JoinAsyncSemantics.hpp | 35 +-
 .../tests/test-resource-MultiStream.hpp | 64 +-
 .../test-operators-bitwise-modulus.cpp | 41 +-
 .../operator/test-operators-equivalence.cpp | 104 +-
 .../util/operator/test-operators-identity.cpp | 44 +-
 .../util/operator/test-operators-logical.cpp | 55 +-
 .../util/operator/test-operators-math.cpp | 52 +-
 test/unit/util/test-float-limits.cpp | 7 +-
 test/unit/util/test-fraction.cpp | 17 +-
 test/unit/util/test-integral-limits.cpp | 7 +-
 test/unit/util/test-math.cpp | 17 +-
 test/unit/util/test-span.cpp | 41 +-
 test/unit/util/test-span.hpp | 40 +-
 test/unit/util/test-timer.cpp | 8 +-
 test/unit/view-layout/test-indexlayout.cpp | 215 +-
 test/unit/view-layout/test-makelayout.cpp | 115 +-
 test/unit/view-layout/test-multiview.cpp | 275 +-
 .../unit/view-layout/test-standard-layout.cpp | 15 +-
 test/unit/view-layout/test-typedlayout.cpp | 119 +-
 test/unit/view-layout/test-typedview.cpp | 169 +-
 .../tests/test-util-workgroup-Enqueue.hpp | 12 +-
 .../tests/test-util-workgroup-WorkStorage.hpp | 29 +-
 .../tests/test-workgroup-Constructor.hpp | 186 +-
 .../tests/test-workgroup-Dispatcher.hpp | 296 +-
 .../tests/test-workgroup-Enqueue-Multiple.hpp | 184 +-
 .../tests/test-workgroup-Enqueue-Single.hpp | 180 +-
 ...test-workgroup-WorkStorage-Constructor.hpp | 29 +-
 .../test-workgroup-WorkStorage-InsertCall.hpp | 36 +-
 .../test-workgroup-WorkStorage-Iterator.hpp | 44 +-
 .../test-workgroup-WorkStorage-Multiple.hpp | 116 +-
 719 files changed, 70916 insertions(+), 60452 deletions(-)

diff --git a/.clang-format b/.clang-format
index 47b6b0bee6..ca4ac0cd75 100644
--- a/.clang-format
+++ b/.clang-format
@@ -2,12 +2,14 @@ BasedOnStyle : LLVM
 # Indent formatting
 IndentWidth : 2
 UseTab: Never
-BreakBeforeBraces : Linux
 KeepEmptyLinesAtTheStartOfBlocks : true
 MaxEmptyLinesToKeep : 2
 AccessModifierOffset : -2
+# This must be off so that include order in RAJA is preserved
+SortIncludes: false
 
 # Control curly brace placement
+BreakBeforeBraces : Custom
 BraceWrapping:
   AfterCaseLabel: true
   AfterClass: true
   AfterControlStatement: true
   AfterEnum: true
   AfterFunction: true
   AfterNamespace: true
   AfterObjCDeclaration: false
   AfterStruct: true
   AfterUnion: true
   AfterExternBlock: false
   BeforeCatch: true
   BeforeElse: true
   # BeforeLambdaBody: true # available in clang 11
   IndentBraces: false
   SplitEmptyFunction: false
   SplitEmptyRecord: false
   SplitEmptyNamespace: false
@@ -30,22 +32,17 @@ BraceWrapping:
 # Pointer alignment
 DerivePointerAlignment: false
 PointerAlignment: Left
-SortIncludes: false
 AllowShortIfStatementsOnASingleLine : true
-ConstructorInitializerAllOnOneLineOrOnePerLine : true
 AllowShortFunctionsOnASingleLine : true
 AllowShortLoopsOnASingleLine : false
-BinPackParameters : true
 AllowAllParametersOfDeclarationOnNextLine : false
 AlignTrailingComments : true
+BinPackArguments : false
+BinPackParameters : false
+ConstructorInitializerAllOnOneLineOrOnePerLine : true
 ColumnLimit : 80
-PenaltyBreakBeforeFirstCallParameter : 100
-PenaltyReturnTypeOnItsOwnLine : 65000
-PenaltyBreakString : 10
 
-BreakBeforeBinaryOperators : None
 AlignAfterOpenBracket: true
-BinPackArguments : false
 AlignOperands : true
 AlwaysBreakTemplateDeclarations : true
-
+BreakBeforeBinaryOperators : None
diff --git a/examples/dynamic-forall.cpp b/examples/dynamic-forall.cpp
index 5131010bd6..5e81a19681 100644 --- a/examples/dynamic-forall.cpp +++ b/examples/dynamic-forall.cpp @@ -28,22 +28,26 @@ void checkResult(int* res, int len); void printResult(int* res, int len); -using policy_list = camp::list - ,RAJA::cuda_exec<512> + , + RAJA::cuda_exec<256>, + RAJA::cuda_exec<512> #endif >; -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { - if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./dynamic-forall N, where N is the index of the policy to run"); + if (argc != 2) + { + RAJA_ABORT_OR_THROW("Usage ./dynamic-forall N, where N is the index of the " + "policy to run"); } // @@ -55,58 +59,61 @@ int main(int argc, char *argv[]) const int pol = std::stoi(argv[1]); std::cout << "\n\nRAJA vector addition example...\n"; - std::cout << "Using policy # "<(N); - int *b = memoryManager::allocate(N); - int *c = memoryManager::allocate(N); + // + // Allocate and initialize vector data + // + int* a = memoryManager::allocate(N); + int* b = memoryManager::allocate(N); + int* c = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = -i; b[i] = i; } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-style vector addition...\n"; // _cstyle_vector_add_start - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { c[i] = a[i] + b[i]; } // _cstyle_vector_add_end checkResult(c, N); -//printResult(c, N); + // printResult(c, N); -//----------------------------------------------------------------------------// -// Example of dynamic policy selection for forall -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Example of dynamic policy selection for forall + //----------------------------------------------------------------------------// - //policy is chosen from the list - RAJA::expt::dynamic_forall(pol, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE (int i) { - c[i] = a[i] + b[i]; - }); + // policy is chosen from the list + RAJA::expt::dynamic_forall( + pol, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE(int i) { + c[i] = a[i] + b[i]; + }); // _rajaseq_vector_add_end checkResult(c, N); -//printResult(c, N); + // printResult(c, N); -//----------------------------------------------------------------------------// -// -// Clean up. -// + //----------------------------------------------------------------------------// + // + // Clean up. 
+ // memoryManager::deallocate(a); memoryManager::deallocate(b); memoryManager::deallocate(c); @@ -122,12 +129,19 @@ int main(int argc, char *argv[]) void checkResult(int* res, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( res[i] != 0 ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (res[i] != 0) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -138,7 +152,8 @@ void checkResult(int* res, int len) void printResult(int* res, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "result[" << i << "] = " << res[i] << std::endl; } std::cout << std::endl; diff --git a/examples/dynamic_mat_transpose.cpp b/examples/dynamic_mat_transpose.cpp index feb5247224..83b946b732 100644 --- a/examples/dynamic_mat_transpose.cpp +++ b/examples/dynamic_mat_transpose.cpp @@ -83,99 +83,110 @@ using launch_policy = RAJA::LaunchPolicy< * Define team policies. * Up to 3 dimension are supported: x,y,z */ -using outer0 = RAJA::LoopPolicy< - RAJA::seq_exec +using outer0 = RAJA::LoopPolicy; + >; using outer1 = RAJA::LoopPolicy< #if defined(RAJA_ENABLE_OPENMP) - RAJA::omp_for_exec + RAJA::omp_for_exec #else - RAJA::seq_exec + RAJA::seq_exec #endif #if defined(RAJA_ENABLE_CUDA) - , - RAJA::cuda_block_y_direct + , + RAJA::cuda_block_y_direct #endif #if defined(RAJA_ENABLE_HIP) - , - RAJA::hip_block_y_direct + , + RAJA::hip_block_y_direct #endif #if defined(RAJA_ENABLE_SYCL) - , - RAJA::sycl_group_1_direct + , + RAJA::sycl_group_1_direct #endif - >; + >; /* * Define thread policies. * Up to 3 dimension are supported: x,y,z */ -using inner0 = RAJA::LoopPolicy< - RAJA::seq_exec +using inner0 = RAJA::LoopPolicy; + >; using inner1 = RAJA::LoopPolicy; + >; -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { - if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device"); + if (argc != 2) + { + RAJA_ABORT_OR_THROW("Usage ./dynamic_mat_transpose host or " + "./dynamic_mat_transpose device"); } // // Run time policy section is demonstrated in this example by specifying // kernel exection space as a command line argument (host or device). - // Example usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device + // Example usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose + // device // std::string exec_space = argv[1]; - if(!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0 )){ - RAJA_ABORT_OR_THROW("Usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device"); + if (!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0)) + { + RAJA_ABORT_OR_THROW("Usage ./dynamic_mat_transpose host or " + "./dynamic_mat_transpose device"); return 0; } RAJA::ExecPlace select_cpu_or_gpu; - if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::HOST; std::cout<<"Running RAJA::launch matrix transpose example on the host"<(N_r * N_c); - int *At = host_res.allocate(N_r * N_c); + int* A = host_res.allocate(N_r * N_c); + int* At = host_res.allocate(N_r * N_c); // // In the following implementations of matrix transpose, we // use RAJA 'View' objects to access the matrix data. 
A RAJA view @@ -225,12 +238,14 @@ int main(int argc, char *argv[]) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of shared matrix transpose...\n"; @@ -241,8 +256,10 @@ int main(int argc, char *argv[]) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // Stack-allocated local array for data on a tile int Tile[TILE_DIM][TILE_DIM]; @@ -253,14 +270,17 @@ int main(int argc, char *argv[]) // Note: loops are ordered so that input matrix data access // is stride-1. // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[ty][tx] = Aview(row, col); } } @@ -272,19 +292,21 @@ int main(int argc, char *argv[]) // Note: loop order is swapped from above so that output matrix // data access is stride-1. // - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) + { + for (int ty = 0; ty < TILE_DIM; ++ty) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[ty][tx]; } } } - } } // _dynamic_mattranspose_localarray_cstyle_end @@ -294,24 +316,26 @@ int main(int argc, char *argv[]) //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA matrix transpose w/ dynamic shared memory ...\n"; + std::cout << "\n Running RAJA matrix transpose w/ dynamic shared memory " + "...\n"; - //Reset memory + // Reset memory std::memset(At, 0, N_r * N_c * sizeof(int)); #if defined(RAJA_GPU_ACTIVE) - //Allocate device side pointers + // Allocate device side pointers int *d_A = nullptr, *d_At = nullptr; - if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) + { - d_A = device_res.allocate(N_r * N_c); + d_A = device_res.allocate(N_r * N_c); d_At = device_res.allocate(N_r * N_c); device_res.memcpy(d_A, A, sizeof(int) * N_r * N_c); device_res.memcpy(d_At, At, sizeof(int) * N_r * N_c); - //switch host/device pointers so we can reuse the views + // switch host/device pointers so we can reuse the views Aview.set_data(d_A); Atview.set_data(d_At); } @@ -322,65 +346,71 @@ int main(int argc, char *argv[]) // _dynamic_mattranspose_shared_mem_end // _dynamic_mattranspose_kernel_start - RAJA::launch - (res, RAJA::LaunchParams(RAJA::Teams(outer_Dimc, outer_Dimr), - RAJA::Threads(TILE_DIM, TILE_DIM), dynamic_shared_mem_size), - "Matrix tranpose with dynamic shared 
memory kernel", - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) - { - RAJA::loop(ctx, RAJA::RangeSegment(0, outer_Dimr), [&] (int by){ - RAJA::loop(ctx, RAJA::RangeSegment(0, outer_Dimc), [&] (int bx){ - - //Request memory from shared memory pool - int * tile_ptr = ctx.getSharedMemory(TILE_DIM * TILE_DIM); - - //Use RAJA View for simplified indexing - RAJA::View> Tile(tile_ptr, TILE_DIM, TILE_DIM); - - RAJA::loop(ctx, RAJA::RangeSegment(0, TILE_DIM), [&] (int ty){ - RAJA::loop(ctx, RAJA::RangeSegment(0, TILE_DIM), [&] (int tx){ - - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index - - // Bounds check - if (row < N_r && col < N_c) { - Tile(ty,tx) = Aview(row, col); - } - - }); - }); - - //Barrier is needed to ensure all threads have written to Tile - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, TILE_DIM), [&] (int ty){ - RAJA::loop(ctx, RAJA::RangeSegment(0, TILE_DIM), [&] (int tx){ - - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index - - // Bounds check - if (row < N_r && col < N_c) { - Atview(col, row) = Tile(ty, tx); - } - - }); + RAJA::launch( + res, + RAJA::LaunchParams(RAJA::Teams(outer_Dimc, outer_Dimr), + RAJA::Threads(TILE_DIM, TILE_DIM), + dynamic_shared_mem_size), + "Matrix tranpose with dynamic shared memory kernel", + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, outer_Dimr), [&](int by) { + RAJA::loop( + ctx, RAJA::RangeSegment(0, outer_Dimc), [&](int bx) { + // Request memory from shared memory pool + int* tile_ptr = ctx.getSharedMemory(TILE_DIM * TILE_DIM); + + // Use RAJA View for simplified indexing + RAJA::View> Tile( + tile_ptr, TILE_DIM, TILE_DIM); + + RAJA::loop( + ctx, RAJA::RangeSegment(0, TILE_DIM), [&](int ty) { + RAJA::loop( + ctx, RAJA::RangeSegment(0, TILE_DIM), [&](int tx) { + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) + { + Tile(ty, tx) = Aview(row, col); + } + }); + }); + + // Barrier is needed to ensure all threads have written to Tile + ctx.teamSync(); + + RAJA::loop( + ctx, RAJA::RangeSegment(0, TILE_DIM), [&](int ty) { + RAJA::loop( + ctx, RAJA::RangeSegment(0, TILE_DIM), [&](int tx) { + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) + { + Atview(col, row) = Tile(ty, tx); + } + }); + }); + + // The launch context uses bump style allocator in which calls + // to getSharedMemory moves a memory buffer pointer to return + // different segments of shared memory. To avoid requesting + // beyond the pre-allocated memory quantity we reset the + // allocator offset counter in the launch context effectively + // releasing shared memory. + ctx.releaseSharedMemory(); }); - - //The launch context uses bump style allocator in which calls - //to getSharedMemory moves a memory buffer pointer to return - //different segments of shared memory. To avoid requesting beyond - //the pre-allocated memory quantity we reset the allocator offset counter - //in the launch context effectively releasing shared memory. 
- ctx.releaseSharedMemory(); - }); + }); }); - }); // _dynamic_mattranspose_kernel_end #if defined(RAJA_GPU_ACTIVE) - if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) + { device_res.memcpy(A, d_A, sizeof(int) * N_r * N_c); device_res.memcpy(At, d_At, sizeof(int) * N_r * N_c); @@ -392,15 +422,16 @@ int main(int argc, char *argv[]) checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// - //Release data + // Release data host_res.deallocate(A); host_res.deallocate(At); #if defined(RAJA_GPU_ACTIVE) - if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) + { device_res.deallocate(d_A); device_res.deallocate(d_At); } @@ -418,16 +449,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -439,11 +476,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - //std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) //<< std::endl; - printf("%d ",Atview(row, col)); + printf("%d ", Atview(row, col)); } std::cout << "" << std::endl; } diff --git a/examples/forall-param-reductions.cpp b/examples/forall-param-reductions.cpp index fb82582704..9779f6c02b 100644 --- a/examples/forall-param-reductions.cpp +++ b/examples/forall-param-reductions.cpp @@ -47,28 +47,32 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA reductions example...\n"; // _reductions_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 1000000; -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. 
+ // RAJA::resources::Host host_res; int* a = host_res.allocate(N); - for (int i = 0; i < N; ++i) { - if ( i % 2 == 0 ) { + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { a[i] = 1; - } else { + } + else + { a[i] = -1; } } -// -// Set min and max loc values -// + // + // Set min and max loc values + // constexpr int minloc_ref = N / 2; a[minloc_ref] = -100; @@ -76,36 +80,36 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) a[maxloc_ref] = 100; // _reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// - the sum will be zero -// - the min will be -100 -// - the max will be 100 -// - the min loc will be N/2 -// - the max loc will be N/2 + 1 -// -// - -// -// Define index range for iterating over a elements in all examples -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // - the sum will be zero + // - the min will be -100 + // - the max will be 100 + // - the min loc will be N/2 + // - the max loc will be N/2 + 1 + // + // + + // + // Define index range for iterating over a elements in all examples + // // _reductions_range_start RAJA::TypedRangeSegment arange(0, N); // _reductions_range_end -// -// Define ValLoc Type -// + // + // Define ValLoc Type + // using VALLOC_INT = RAJA::expt::ValLoc; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential reductions...\n"; // _reductions_raja_seq_start - using EXEC_POL1 = RAJA::seq_exec; + using EXEC_POL1 = RAJA::seq_exec; int seq_sum = 0; int seq_min = std::numeric_limits::max(); @@ -113,46 +117,52 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT seq_minloc(std::numeric_limits::max(), -1); VALLOC_INT seq_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(host_res, arange, - RAJA::expt::Reduce(&seq_sum), - RAJA::expt::Reduce(&seq_min), - RAJA::expt::Reduce(&seq_max), - RAJA::expt::Reduce(&seq_minloc), - RAJA::expt::Reduce(&seq_maxloc), - RAJA::expt::KernelName("RAJA Reduce Seq Kernel"), - [=](int i, int &_seq_sum, int &_seq_min, int &_seq_max, VALLOC_INT &_seq_minloc, VALLOC_INT &_seq_maxloc) { - _seq_sum += a[i]; - - _seq_min = RAJA_MIN(a[i], _seq_min); - _seq_max = RAJA_MAX(a[i], _seq_max); - - _seq_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _seq_minloc); - _seq_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _seq_maxloc); - //_seq_minloc.min(a[i], i); - //_seq_maxloc.max(a[i], i); - // Note : RAJA::expt::ValLoc objects provide min() and max() methods - // that are equivalent to the assignments with RAJA_MIN and RAJA_MAX - // above. 
- } - ); + RAJA::forall( + host_res, + arange, + RAJA::expt::Reduce(&seq_sum), + RAJA::expt::Reduce(&seq_min), + RAJA::expt::Reduce(&seq_max), + RAJA::expt::Reduce(&seq_minloc), + RAJA::expt::Reduce(&seq_maxloc), + RAJA::expt::KernelName("RAJA Reduce Seq Kernel"), + [=](int i, + int& _seq_sum, + int& _seq_min, + int& _seq_max, + VALLOC_INT& _seq_minloc, + VALLOC_INT& _seq_maxloc) { + _seq_sum += a[i]; + + _seq_min = RAJA_MIN(a[i], _seq_min); + _seq_max = RAJA_MAX(a[i], _seq_max); + + _seq_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _seq_minloc); + _seq_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _seq_maxloc); + //_seq_minloc.min(a[i], i); + //_seq_maxloc.max(a[i], i); + // Note : RAJA::expt::ValLoc objects provide min() and max() methods + // that are equivalent to the assignments with RAJA_MIN and + // RAJA_MAX above. + }); std::cout << "\tsum = " << seq_sum << std::endl; std::cout << "\tmin = " << seq_min << std::endl; std::cout << "\tmax = " << seq_max << std::endl; std::cout << "\tmin, loc = " << seq_minloc.getVal() << " , " - << seq_minloc.getLoc() << std::endl; + << seq_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << seq_maxloc.getVal() << " , " - << seq_maxloc.getLoc() << std::endl; + << seq_maxloc.getLoc() << std::endl; // _reductions_raja_seq_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP reductions...\n"; // _reductions_raja_omppolicy_start - using EXEC_POL2 = RAJA::omp_parallel_for_exec; + using EXEC_POL2 = RAJA::omp_parallel_for_exec; // _reductions_raja_omppolicy_end int omp_sum = 0; @@ -161,37 +171,43 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT omp_minloc(std::numeric_limits::max(), -1); VALLOC_INT omp_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(host_res, arange, - RAJA::expt::Reduce(&omp_sum), - RAJA::expt::Reduce(&omp_min), - RAJA::expt::Reduce(&omp_max), - RAJA::expt::Reduce(&omp_minloc), - RAJA::expt::Reduce(&omp_maxloc), - RAJA::expt::KernelName("RAJA Reduce OpenMP Kernel"), - [=](int i, int &_omp_sum, int &_omp_min, int &_omp_max, VALLOC_INT &_omp_minloc, VALLOC_INT &_omp_maxloc) { - _omp_sum += a[i]; - - _omp_min = RAJA_MIN(a[i], _omp_min); - _omp_max = RAJA_MAX(a[i], _omp_max); - - _omp_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _omp_minloc); - _omp_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _omp_maxloc); - //_omp_minloc.min(a[i], i); - //_omp_maxloc.max(a[i], i); - } - ); + RAJA::forall( + host_res, + arange, + RAJA::expt::Reduce(&omp_sum), + RAJA::expt::Reduce(&omp_min), + RAJA::expt::Reduce(&omp_max), + RAJA::expt::Reduce(&omp_minloc), + RAJA::expt::Reduce(&omp_maxloc), + RAJA::expt::KernelName("RAJA Reduce OpenMP Kernel"), + [=](int i, + int& _omp_sum, + int& _omp_min, + int& _omp_max, + VALLOC_INT& _omp_minloc, + VALLOC_INT& _omp_maxloc) { + _omp_sum += a[i]; + + _omp_min = RAJA_MIN(a[i], _omp_min); + _omp_max = RAJA_MAX(a[i], _omp_max); + + _omp_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _omp_minloc); + _omp_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _omp_maxloc); + //_omp_minloc.min(a[i], i); + //_omp_maxloc.max(a[i], i); + }); std::cout << "\tsum = " << omp_sum << std::endl; std::cout << "\tmin = " << omp_min << std::endl; std::cout << "\tmax = " << omp_max << std::endl; std::cout << "\tmin, loc = " << omp_minloc.getVal() << " , " - << omp_minloc.getLoc() << std::endl; + << omp_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " 
<< omp_maxloc.getVal() << " , " - << omp_maxloc.getLoc() << std::endl; + << omp_maxloc.getLoc() << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_TARGET_OPENMP) std::cout << "\n Running RAJA OpenMP Target reductions...\n"; @@ -199,7 +215,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::resources::Omp omp_res; // _reductions_raja_omppolicy_start - using EXEC_POL3 = RAJA::omp_target_parallel_for_exec_nt; + using EXEC_POL3 = RAJA::omp_target_parallel_for_exec_nt; // _reductions_raja_omppolicy_end int omp_t_sum = 0; @@ -208,38 +224,44 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT omp_t_minloc(std::numeric_limits::max(), -1); VALLOC_INT omp_t_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(omp_res, arange, - RAJA::expt::Reduce(&omp_t_sum), - RAJA::expt::Reduce(&omp_t_min), - RAJA::expt::Reduce(&omp_t_max), - RAJA::expt::Reduce(&omp_t_minloc), - RAJA::expt::Reduce(&omp_t_maxloc), - RAJA::expt::KernelName("RAJA Reduce Target OpenMP Kernel"), - [=](int i, int &_omp_t_sum, int &_omp_t_min, int &_omp_t_max, VALLOC_INT &_omp_t_minloc, VALLOC_INT &_omp_t_maxloc) { - _omp_t_sum += a[i]; - - _omp_t_min = RAJA_MIN(a[i], _omp_t_min); - _omp_t_max = RAJA_MAX(a[i], _omp_t_max); - - _omp_t_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _omp_t_minloc); - _omp_t_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _omp_t_maxloc); - //_omp_t_minloc.min(a[i], i); - //_omp_t_maxloc.max(a[i], i); - } - ); + RAJA::forall( + omp_res, + arange, + RAJA::expt::Reduce(&omp_t_sum), + RAJA::expt::Reduce(&omp_t_min), + RAJA::expt::Reduce(&omp_t_max), + RAJA::expt::Reduce(&omp_t_minloc), + RAJA::expt::Reduce(&omp_t_maxloc), + RAJA::expt::KernelName("RAJA Reduce Target OpenMP Kernel"), + [=](int i, + int& _omp_t_sum, + int& _omp_t_min, + int& _omp_t_max, + VALLOC_INT& _omp_t_minloc, + VALLOC_INT& _omp_t_maxloc) { + _omp_t_sum += a[i]; + + _omp_t_min = RAJA_MIN(a[i], _omp_t_min); + _omp_t_max = RAJA_MAX(a[i], _omp_t_max); + + _omp_t_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _omp_t_minloc); + _omp_t_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _omp_t_maxloc); + //_omp_t_minloc.min(a[i], i); + //_omp_t_maxloc.max(a[i], i); + }); std::cout << "\tsum = " << omp_t_sum << std::endl; std::cout << "\tmin = " << omp_t_min << std::endl; std::cout << "\tmax = " << omp_t_max << std::endl; std::cout << "\tmin, loc = " << omp_t_minloc.getVal() << " , " - << omp_t_minloc.getLoc() << std::endl; + << omp_t_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << omp_t_maxloc.getVal() << " , " - << omp_t_maxloc.getLoc() << std::endl; + << omp_t_maxloc.getLoc() << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA reductions...\n"; @@ -250,7 +272,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) cuda_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_cudapolicy_start - using EXEC_POL3 = RAJA::cuda_exec; + using EXEC_POL3 = RAJA::cuda_exec; // _reductions_raja_cudapolicy_end int cuda_sum = 0; @@ -259,37 +281,43 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT cuda_minloc(std::numeric_limits::max(), -1); VALLOC_INT cuda_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(cuda_res, 
arange, - RAJA::expt::Reduce(&cuda_sum), - RAJA::expt::Reduce(&cuda_min), - RAJA::expt::Reduce(&cuda_max), - RAJA::expt::Reduce(&cuda_minloc), - RAJA::expt::Reduce(&cuda_maxloc), - RAJA::expt::KernelName("RAJA Reduce CUDA Kernel"), - [=] RAJA_DEVICE (int i, int &_cuda_sum, int &_cuda_min, int &_cuda_max, VALLOC_INT &_cuda_minloc, VALLOC_INT &_cuda_maxloc) { - _cuda_sum += d_a[i]; - - _cuda_min = RAJA_MIN(d_a[i], _cuda_min); - _cuda_max = RAJA_MAX(d_a[i], _cuda_max); - - _cuda_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _cuda_minloc); - _cuda_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _cuda_maxloc); - //_cuda_minloc.min(a[i], i); - //_cuda_maxloc.max(a[i], i); - } - ); + RAJA::forall( + cuda_res, + arange, + RAJA::expt::Reduce(&cuda_sum), + RAJA::expt::Reduce(&cuda_min), + RAJA::expt::Reduce(&cuda_max), + RAJA::expt::Reduce(&cuda_minloc), + RAJA::expt::Reduce(&cuda_maxloc), + RAJA::expt::KernelName("RAJA Reduce CUDA Kernel"), + [=] RAJA_DEVICE(int i, + int& _cuda_sum, + int& _cuda_min, + int& _cuda_max, + VALLOC_INT& _cuda_minloc, + VALLOC_INT& _cuda_maxloc) { + _cuda_sum += d_a[i]; + + _cuda_min = RAJA_MIN(d_a[i], _cuda_min); + _cuda_max = RAJA_MAX(d_a[i], _cuda_max); + + _cuda_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _cuda_minloc); + _cuda_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _cuda_maxloc); + //_cuda_minloc.min(a[i], i); + //_cuda_maxloc.max(a[i], i); + }); std::cout << "\tsum = " << cuda_sum << std::endl; std::cout << "\tmin = " << cuda_min << std::endl; std::cout << "\tmax = " << cuda_max << std::endl; std::cout << "\tmin, loc = " << cuda_minloc.getVal() << " , " - << cuda_minloc.getLoc() << std::endl; + << cuda_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << cuda_maxloc.getVal() << " , " - << cuda_maxloc.getLoc() << std::endl; + << cuda_maxloc.getLoc() << std::endl; cuda_res.deallocate(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP reductions...\n"; @@ -300,7 +328,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) hip_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_hippolicy_start - using EXEC_POL3 = RAJA::hip_exec; + using EXEC_POL3 = RAJA::hip_exec; // _reductions_raja_hippolicy_end int hip_sum = 0; @@ -309,38 +337,43 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT hip_minloc(std::numeric_limits::max(), -1); VALLOC_INT hip_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(arange, - RAJA::expt::Reduce(&hip_sum), - RAJA::expt::Reduce(&hip_min), - RAJA::expt::Reduce(&hip_max), - RAJA::expt::Reduce(&hip_minloc), - RAJA::expt::Reduce(&hip_maxloc), - RAJA::expt::KernelName("RAJA Reduce HIP Kernel"), - [=] RAJA_DEVICE (int i, int &_hip_sum, int &_hip_min, int &_hip_max, VALLOC_INT &_hip_minloc, VALLOC_INT &_hip_maxloc) { - _hip_sum += d_a[i]; - - _hip_min = RAJA_MIN(d_a[i], _hip_min); - _hip_max = RAJA_MAX(d_a[i], _hip_max); - - _hip_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _hip_minloc); - _hip_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _hip_maxloc); - //_hip_minloc.min(d_a[i], i); - //_hip_maxloc.max(d_a[i], i); - } - ); + RAJA::forall( + arange, + RAJA::expt::Reduce(&hip_sum), + RAJA::expt::Reduce(&hip_min), + RAJA::expt::Reduce(&hip_max), + RAJA::expt::Reduce(&hip_minloc), + RAJA::expt::Reduce(&hip_maxloc), + RAJA::expt::KernelName("RAJA Reduce HIP Kernel"), + [=] RAJA_DEVICE(int i, + int& _hip_sum, + int& _hip_min, + 
int& _hip_max, + VALLOC_INT& _hip_minloc, + VALLOC_INT& _hip_maxloc) { + _hip_sum += d_a[i]; + + _hip_min = RAJA_MIN(d_a[i], _hip_min); + _hip_max = RAJA_MAX(d_a[i], _hip_max); + + _hip_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _hip_minloc); + _hip_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _hip_maxloc); + //_hip_minloc.min(d_a[i], i); + //_hip_maxloc.max(d_a[i], i); + }); std::cout << "\tsum = " << hip_sum << std::endl; std::cout << "\tmin = " << hip_min << std::endl; std::cout << "\tmax = " << hip_max << std::endl; std::cout << "\tmin, loc = " << hip_minloc.getVal() << " , " - << hip_minloc.getLoc() << std::endl; + << hip_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << hip_maxloc.getVal() << " , " - << hip_maxloc.getLoc() << std::endl; + << hip_maxloc.getLoc() << std::endl; hip_res.deallocate(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) std::cout << "\n Running RAJA SYCL reductions...\n"; @@ -351,7 +384,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) sycl_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_syclpolicy_start - using EXEC_POL3 = RAJA::sycl_exec; + using EXEC_POL3 = RAJA::sycl_exec; // _reductions_raja_syclpolicy_end int sycl_sum = 0; @@ -360,42 +393,48 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT sycl_minloc(std::numeric_limits::max(), -1); VALLOC_INT sycl_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(sycl_res, arange, - RAJA::expt::Reduce(&sycl_sum), - RAJA::expt::Reduce(&sycl_min), - RAJA::expt::Reduce(&sycl_max), - RAJA::expt::Reduce(&sycl_minloc), - RAJA::expt::Reduce(&sycl_maxloc), - RAJA::expt::KernelName("RAJA Reduce SYCL Kernel"), - [=] RAJA_DEVICE (int i, int &_sycl_sum, int &_sycl_min, int &_sycl_max, VALLOC_INT &_sycl_minloc, VALLOC_INT &_sycl_maxloc) { - _sycl_sum += d_a[i]; - - _sycl_min = RAJA_MIN(d_a[i], _sycl_min); - _sycl_max = RAJA_MAX(d_a[i], _sycl_max); - - _sycl_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _sycl_minloc); - _sycl_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _sycl_maxloc); - //_sycl_minloc.min(d_a[i], i); - //_sycl_maxloc.max(d_a[i], i); - } - ); + RAJA::forall( + sycl_res, + arange, + RAJA::expt::Reduce(&sycl_sum), + RAJA::expt::Reduce(&sycl_min), + RAJA::expt::Reduce(&sycl_max), + RAJA::expt::Reduce(&sycl_minloc), + RAJA::expt::Reduce(&sycl_maxloc), + RAJA::expt::KernelName("RAJA Reduce SYCL Kernel"), + [=] RAJA_DEVICE(int i, + int& _sycl_sum, + int& _sycl_min, + int& _sycl_max, + VALLOC_INT& _sycl_minloc, + VALLOC_INT& _sycl_maxloc) { + _sycl_sum += d_a[i]; + + _sycl_min = RAJA_MIN(d_a[i], _sycl_min); + _sycl_max = RAJA_MAX(d_a[i], _sycl_max); + + _sycl_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _sycl_minloc); + _sycl_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _sycl_maxloc); + //_sycl_minloc.min(d_a[i], i); + //_sycl_maxloc.max(d_a[i], i); + }); std::cout << "\tsum = " << sycl_sum << std::endl; std::cout << "\tmin = " << sycl_min << std::endl; std::cout << "\tmax = " << sycl_max << std::endl; std::cout << "\tmin, loc = " << sycl_minloc.getVal() << " , " - << sycl_minloc.getLoc() << std::endl; + << sycl_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << sycl_maxloc.getVal() << " , " - << sycl_maxloc.getLoc() << std::endl; + << sycl_maxloc.getLoc() << std::endl; sycl_res.deallocate(d_a); #endif -//----------------------------------------------------------------------------// + 
//----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // host_res.deallocate(a); std::cout << "\n DONE!...\n"; diff --git a/examples/forall_multi-reductions.cpp b/examples/forall_multi-reductions.cpp index c3be312194..d8b145f9ee 100644 --- a/examples/forall_multi-reductions.cpp +++ b/examples/forall_multi-reductions.cpp @@ -27,7 +27,7 @@ * */ -template < typename t_exec_policy, typename t_multi_reduce_policy > +template struct Backend { using exec_policy = t_exec_policy; @@ -38,50 +38,51 @@ struct Backend auto example_policies = camp::make_tuple( - Backend{"Sequential"} + Backend{"Sequential"} #if defined(RAJA_ENABLE_OPENMP) - , Backend{"OpenMP"} + , + Backend{"OpenMP"} #endif #if defined(RAJA_ENABLE_CUDA) - , Backend, RAJA::cuda_multi_reduce_atomic>{"Cuda"} + , + Backend, RAJA::cuda_multi_reduce_atomic>{"Cuda"} #endif #if defined(RAJA_ENABLE_HIP) - , Backend, RAJA::hip_multi_reduce_atomic>{"Hip"} + , + Backend, RAJA::hip_multi_reduce_atomic>{"Hip"} #endif - ); +); -template < typename exec_policy, typename multi_reduce_policy > +template void example_code(RAJA::RangeSegment arange, int num_bins, int* bins, int* a) { - RAJA::MultiReduceSum multi_reduce_sum(num_bins); - RAJA::MultiReduceMin multi_reduce_min(num_bins); - RAJA::MultiReduceMax multi_reduce_max(num_bins); + RAJA::MultiReduceSum multi_reduce_sum(num_bins); + RAJA::MultiReduceMin multi_reduce_min(num_bins); + RAJA::MultiReduceMax multi_reduce_max(num_bins); RAJA::MultiReduceBitAnd multi_reduce_and(num_bins); - RAJA::MultiReduceBitOr multi_reduce_or(num_bins); - - RAJA::forall(arange, - [=] RAJA_HOST_DEVICE(RAJA::Index_type i) { + RAJA::MultiReduceBitOr multi_reduce_or(num_bins); + RAJA::forall(arange, [=] RAJA_HOST_DEVICE(RAJA::Index_type i) { int bin = bins[i]; - multi_reduce_sum[bin] += a[i]; + multi_reduce_sum[bin] += a[i]; multi_reduce_min[bin].min(a[i]); multi_reduce_max[bin].max(a[i]); - multi_reduce_and[bin] &= a[i]; - multi_reduce_or [bin] |= a[i]; - + multi_reduce_and[bin] &= a[i]; + multi_reduce_or[bin] |= a[i]; }); - for (int bin = 0; bin < num_bins; ++bin) { + for (int bin = 0; bin < num_bins; ++bin) + { std::cout << "\tsum[" << bin << "] = " << multi_reduce_sum.get(bin) << '\n'; std::cout << "\tmin[" << bin << "] = " << multi_reduce_min.get(bin) << '\n'; std::cout << "\tmax[" << bin << "] = " << multi_reduce_max.get(bin) << '\n'; std::cout << "\tand[" << bin << "] = " << multi_reduce_and.get(bin) << '\n'; - std::cout << "\tor [" << bin << "] = " << multi_reduce_or .get(bin) << '\n'; + std::cout << "\tor [" << bin << "] = " << multi_reduce_or.get(bin) << '\n'; std::cout << '\n'; } } @@ -90,77 +91,78 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) { // _multi_reductions_array_init_start -// -// Define array length -// + // + // Define array length + // const int N = 1000000; const int num_bins = 10; -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. 
+ // camp::resources::Host host_res; int* host_bins = host_res.template allocate(N); - int* host_a = host_res.template allocate(N); + int* host_a = host_res.template allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { host_bins[i] = i % num_bins; - host_a[i] = (i % (2*num_bins)) - num_bins; + host_a[i] = (i % (2 * num_bins)) - num_bins; } // _multi_reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// for bin in [0, num_bins) -// - the sum will be (bin - num_bins/2) * N / num_bins -// - the min will be bin - num_bins -// - the max will be bin -// - the and will be min & max -// - the or will be min | max -// - -// -// Define index range for iterating over a elements in all examples -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // for bin in [0, num_bins) + // - the sum will be (bin - num_bins/2) * N / num_bins + // - the min will be bin - num_bins + // - the max will be bin + // - the and will be min & max + // - the or will be min | max + // + + // + // Define index range for iterating over a elements in all examples + // // _multi_reductions_range_start RAJA::RangeSegment arange(0, N); // _multi_reductions_range_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// RAJA::for_each_tuple(example_policies, [&](auto const& backend) { - std::cout << "Running " << backend.name << " policies" << '\n'; using exec_policy = typename std::decay_t::exec_policy; - using multi_reduce_policy = typename std::decay_t::multi_reduce_policy; + using multi_reduce_policy = + typename std::decay_t::multi_reduce_policy; auto res = RAJA::resources::get_default_resource(); int* bins = res.template allocate(N); - int* a = res.template allocate(N); + int* a = res.template allocate(N); - res.memcpy(bins, host_bins, N*sizeof(int)); - res.memcpy(a , host_a , N*sizeof(int)); + res.memcpy(bins, host_bins, N * sizeof(int)); + res.memcpy(a, host_a, N * sizeof(int)); example_code(arange, num_bins, bins, a); res.deallocate(bins); - res.deallocate(a ); + res.deallocate(a); std::cout << std::endl; }); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // host_res.deallocate(host_bins); - host_res.deallocate(host_a ); + host_res.deallocate(host_a); std::cout << "\n DONE!...\n"; diff --git a/examples/jacobi.cpp b/examples/jacobi.cpp index 0badaa7396..8bf25d9a86 100644 --- a/examples/jacobi.cpp +++ b/examples/jacobi.cpp @@ -39,7 +39,7 @@ * (I, Iold) and initialized to zero. The first set of * nested for loops apply an iteration of the Jacobi * scheme. The scheme is only applied to the interior - * nodes. + * nodes. 
* * The second set of nested for loops is used to * update Iold and compute the l_2 norm of the @@ -52,7 +52,7 @@ * ----[RAJA Concepts]--------------- * - Forall::nested loop * - RAJA Reduction - * + * */ @@ -63,9 +63,9 @@ * * CUDA_BLOCK_SIZE_Y - Number of threads in the * y-dimension of a cuda thread block - * + * * CUDA_BLOCK_SIZE - Number of threads per threads block -*/ + */ #if defined(RAJA_ENABLE_CUDA) const int CUDA_BLOCK_SIZE = 256; #endif @@ -80,23 +80,24 @@ const int HIP_BLOCK_SIZE = 256; // h - Spacing between grid points // n - Number of grid points // -struct grid_s { +struct grid_s +{ double o, h; int n; }; -// +// // ----[Functions]--------- // solution - Function for the analytic solution // computeErr - Displays the maximum error in the solution // double solution(double x, double y); -void computeErr(double *I, grid_s grid); +void computeErr(double* I, grid_s grid); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { - std::cout<<"Jacobi Example"<(NN); - double *Iold = memoryManager::allocate(NN); + double* I = memoryManager::allocate(NN); + double* Iold = memoryManager::allocate(NN); memset(I, 0, NN * sizeof(double)); @@ -138,23 +139,26 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) resI2 = 1; iteration = 0; - while (resI2 > tol * tol) { + while (resI2 > tol * tol) + { // // Jacobi Iteration // - for (int n = 1; n <= N; ++n) { - for (int m = 1; m <= N; ++m) { + for (int n = 1; n <= N; ++n) + { + for (int m = 1; m <= N; ++m) + { double x = gridx.o + m * gridx.h; double y = gridx.o + n * gridx.h; - double f = gridx.h * gridx.h - * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); + double f = gridx.h * gridx.h * + (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); int id = n * (N + 2) + m; - I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] - + Iold[id + 1]); + I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + + Iold[id - 1] + Iold[id + 1]); } } @@ -162,12 +166,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Compute residual and update Iold // resI2 = 0.0; - for (int k = 0; k < NN; k++) { + for (int k = 0; k < NN; k++) + { resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); Iold[k] = I[k]; } - if (iteration > maxIter) { + if (iteration > maxIter) + { printf("Standard C++ Loop - Maxed out on iterations \n"); exit(-1); } @@ -184,9 +190,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::RangeSegment gridRange(0, NN); RAJA::RangeSegment jacobiRange(1, (N + 1)); - using jacobiSeqNestedPolicy = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>> > >; + using jacobiSeqNestedPolicy = RAJA::KernelPolicy>>>; printf("RAJA: Sequential Policy - Nested ForallN \n"); resI2 = 1; @@ -195,41 +202,39 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) memset(Iold, 0, NN * sizeof(double)); /* - * Sequential Jacobi Iteration. + * Sequential Jacobi Iteration. * * Note that a RAJA ReduceSum object is used to accumulate the sum - * for the residual. Since the loop is run sequentially, this is - * not strictly necessary. It is done here for consistency and + * for the residual. Since the loop is run sequentially, this is + * not strictly necessary. It is done here for consistency and * comparison with other RAJA variants in this example. 
- */ - while (resI2 > tol * tol) { + */ + while (resI2 > tol * tol) + { - RAJA::kernel(RAJA::make_tuple(jacobiRange,jacobiRange), - [=] (RAJA::Index_type m, RAJA::Index_type n) { - + RAJA::kernel( + RAJA::make_tuple(jacobiRange, jacobiRange), + [=](RAJA::Index_type m, RAJA::Index_type n) { double x = gridx.o + m * gridx.h; double y = gridx.o + n * gridx.h; - double f = gridx.h * gridx.h - * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); + double f = gridx.h * gridx.h * + (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); int id = n * (N + 2) + m; - I[id] = - 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] - + Iold[id + 1]); + I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + + Iold[id - 1] + Iold[id + 1]); }); RAJA::ReduceSum RAJA_resI2(0.0); - RAJA::forall( - gridRange, [=](RAJA::Index_type k) { - - RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); - Iold[k] = I[k]; + RAJA::forall(gridRange, [=](RAJA::Index_type k) { + RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); + Iold[k] = I[k]; + }); - }); - resI2 = RAJA_resI2; - if (iteration > maxIter) { + if (iteration > maxIter) + { printf("Jacobi: Sequential - Maxed out on iterations! \n"); exit(-1); } @@ -237,17 +242,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } computeErr(I, gridx); printf("No of iterations: %d \n \n", iteration); - - + + #if defined(RAJA_ENABLE_OPENMP) printf("RAJA: OpenMP Policy - Nested ForallN \n"); resI2 = 1; iteration = 0; memset(I, 0, NN * sizeof(double)); memset(Iold, 0, NN * sizeof(double)); - + /* - * OpenMP parallel Jacobi Iteration. + * OpenMP parallel Jacobi Iteration. * * ----[RAJA Policies]----------- * RAJA::omp_collapse_for_exec - @@ -256,41 +261,41 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) * Note that OpenMP RAJA ReduceSum object performs the reduction * operation for the residual in a thread-safe manner. */ - - using jacobiOmpNestedPolicy = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::omp_parallel_for_exec, - RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; - - while (resI2 > tol * tol) { - - RAJA::kernel(RAJA::make_tuple(jacobiRange,jacobiRange), - [=] (RAJA::Index_type m, RAJA::Index_type n) { - - - double x = gridx.o + m * gridx.h; - double y = gridx.o + n * gridx.h; - - double f = gridx.h * gridx.h * - (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); - - int id = n * (N + 2) + m; - I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + - Iold[id - 1] + Iold[id + 1]); - }); + + using jacobiOmpNestedPolicy = RAJA::KernelPolicy>>>; + + while (resI2 > tol * tol) + { + + RAJA::kernel( + RAJA::make_tuple(jacobiRange, jacobiRange), + [=](RAJA::Index_type m, RAJA::Index_type n) { + double x = gridx.o + m * gridx.h; + double y = gridx.o + n * gridx.h; + + double f = gridx.h * gridx.h * + (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); + + int id = n * (N + 2) + m; + I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + + Iold[id - 1] + Iold[id + 1]); + }); RAJA::ReduceSum RAJA_resI2(0.0); - RAJA::forall( gridRange, - [=](RAJA::Index_type k) { - - RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); - Iold[k] = I[k]; - - }); - + RAJA::forall( + gridRange, [=](RAJA::Index_type k) { + RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); + Iold[k] = I[k]; + }); + resI2 = RAJA_resI2; - if (iteration > maxIter) { + if (iteration > maxIter) + { printf("Jacobi: OpenMP - Maxed out on iterations! 
\n"); exit(-1); } @@ -303,7 +308,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) /* - * CUDA Jacobi Iteration. + * CUDA Jacobi Iteration. * * ----[RAJA Policies]----------- * RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec - @@ -315,42 +320,45 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) printf("RAJA: CUDA Policy - Nested ForallN \n"); - using jacobiCUDANestedPolicy = RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::cuda_block_x_loop, - RAJA::statement::For<1, RAJA::cuda_thread_y_direct, - RAJA::statement::For<0, RAJA::cuda_thread_x_direct, - RAJA::statement::Lambda<0> - > - > - > - > - > >; - + using jacobiCUDANestedPolicy = + RAJA::KernelPolicy, + RAJA::cuda_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed<32>, + RAJA::cuda_block_x_loop, + RAJA::statement::For< + 1, + RAJA::cuda_thread_y_direct, + RAJA::statement::For<0, + RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<0>>>>>>>; + resI2 = 1; iteration = 0; memset(I, 0, NN * sizeof(double)); memset(Iold, 0, NN * sizeof(double)); - while (resI2 > tol * tol) { + while (resI2 > tol * tol) + { // - // Jacobi Iteration + // Jacobi Iteration // RAJA::kernel( - RAJA::make_tuple(jacobiRange,jacobiRange), - [=] RAJA_DEVICE (RAJA::Index_type m, RAJA::Index_type n) { - + RAJA::make_tuple(jacobiRange, jacobiRange), + [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) { double x = gridx.o + m * gridx.h; double y = gridx.o + n * gridx.h; - double f = gridx.h * gridx.h - * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); + double f = gridx.h * gridx.h * + (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); int id = n * (N + 2) + m; - I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] - + Iold[id + 1]); + I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + + Iold[id - 1] + Iold[id + 1]); }); // @@ -358,16 +366,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // RAJA::ReduceSum RAJA_resI2(0.0); RAJA::forall>( - gridRange, [=] RAJA_DEVICE (RAJA::Index_type k) { - + gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) { RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); Iold[k] = I[k]; - - }); + }); resI2 = RAJA_resI2; - if (iteration > maxIter) { + if (iteration > maxIter) + { printf("RAJA: CUDA - Maxed out on iterations! 
\n"); exit(-1); } @@ -392,47 +399,51 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) printf("RAJA: HIP Policy - Nested ForallN \n"); - using jacobiHIPNestedPolicy = RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::hip_block_x_loop, - RAJA::statement::For<1, RAJA::hip_thread_y_direct, - RAJA::statement::For<0, RAJA::hip_thread_x_direct, - RAJA::statement::Lambda<0> - > - > - > - > - > >; + using jacobiHIPNestedPolicy = + RAJA::KernelPolicy, + RAJA::hip_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed<32>, + RAJA::hip_block_x_loop, + RAJA::statement::For< + 1, + RAJA::hip_thread_y_direct, + RAJA::statement::For<0, + RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<0>>>>>>>; resI2 = 1; iteration = 0; memset(I, 0, NN * sizeof(double)); memset(Iold, 0, NN * sizeof(double)); - double *d_I = memoryManager::allocate_gpu(NN); - double *d_Iold = memoryManager::allocate_gpu(NN); - hipErrchk(hipMemcpy( d_I, I, NN * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice )); + double* d_I = memoryManager::allocate_gpu(NN); + double* d_Iold = memoryManager::allocate_gpu(NN); + hipErrchk(hipMemcpy(d_I, I, NN * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice)); - while (resI2 > tol * tol) { + while (resI2 > tol * tol) + { // // Jacobi Iteration // RAJA::kernel( - RAJA::make_tuple(jacobiRange,jacobiRange), - [=] RAJA_DEVICE (RAJA::Index_type m, RAJA::Index_type n) { - + RAJA::make_tuple(jacobiRange, jacobiRange), + [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) { double x = gridx.o + m * gridx.h; double y = gridx.o + n * gridx.h; - double f = gridx.h * gridx.h - * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); + double f = gridx.h * gridx.h * + (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); int id = n * (N + 2) + m; - d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + d_Iold[id - 1] - + d_Iold[id + 1]); + d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + + d_Iold[id - 1] + d_Iold[id + 1]); }); // @@ -440,23 +451,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // RAJA::ReduceSum RAJA_resI2(0.0); RAJA::forall>( - gridRange, [=] RAJA_DEVICE (RAJA::Index_type k) { - + gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) { RAJA_resI2 += (d_I[k] - d_Iold[k]) * (d_I[k] - d_Iold[k]); d_Iold[k] = d_I[k]; - - }); + }); resI2 = RAJA_resI2; - if (iteration > maxIter) { + if (iteration > maxIter) + { printf("RAJA: HIP - Maxed out on iterations! 
\n"); exit(-1); } iteration++; } hipDeviceSynchronize(); - hipErrchk(hipMemcpy( I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost)); computeErr(I, gridx); printf("No of iterations: %d \n \n", iteration); @@ -466,7 +476,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) memoryManager::deallocate(I); memoryManager::deallocate(Iold); - + return 0; } @@ -482,25 +492,26 @@ double solution(double x, double y) // // Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf} // -void computeErr(double *I, grid_s grid) +void computeErr(double* I, grid_s grid) { RAJA::RangeSegment gridRange(0, grid.n); RAJA::ReduceMax tMax(-1.0); - using jacobiSeqNestedPolicy = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; - - RAJA::kernel(RAJA::make_tuple(gridRange,gridRange), - [=] (RAJA::Index_type ty, RAJA::Index_type tx ) { - - int id = tx + grid.n * ty; - double x = grid.o + tx * grid.h; - double y = grid.o + ty * grid.h; - double myErr = std::abs(I[id] - solution(x, y)); - tMax.max(myErr); - }); + using jacobiSeqNestedPolicy = RAJA::KernelPolicy>>>; + + RAJA::kernel( + RAJA::make_tuple(gridRange, gridRange), + [=](RAJA::Index_type ty, RAJA::Index_type tx) { + int id = tx + grid.n * ty; + double x = grid.o + tx * grid.h; + double y = grid.o + ty * grid.h; + double myErr = std::abs(I[id] - solution(x, y)); + tMax.max(myErr); + }); double l2err = tMax; printf("Max error = %lg, h = %f \n", l2err, grid.h); diff --git a/examples/kernel-dynamic-tile.cpp b/examples/kernel-dynamic-tile.cpp index 5de2123425..6aac29178a 100644 --- a/examples/kernel-dynamic-tile.cpp +++ b/examples/kernel-dynamic-tile.cpp @@ -1,34 +1,36 @@ #include "RAJA/RAJA.hpp" -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA dynamic_tile example...\n\n"; -//Using policy = KernelPolicy, seq_exec, …>>; -//RAJA::kernel_param( -// make_tuple(RangeSegment(0,N)), -// make_tuple(32), // param 0 is referenced by tile_dynamic -// [=](int i, int tile_size){ -// -// }); + // Using policy = KernelPolicy, seq_exec, …>>; + // RAJA::kernel_param( + // make_tuple(RangeSegment(0,N)), + // make_tuple(32), // param 0 is referenced by tile_dynamic + // [=](int i, int tile_size){ + // + // }); using namespace RAJA; - kernel_param< - KernelPolicy< - statement::Tile<1, tile_dynamic<1>, seq_exec, - statement::Tile<0, tile_dynamic<0>, seq_exec, - statement::For<1, seq_exec, - statement::For<0, seq_exec, statement::Lambda<0>> - > - > - > - > - >(make_tuple(RangeSegment{0,25}, RangeSegment{0,25}), + kernel_param, + seq_exec, + statement::Tile< + 0, + tile_dynamic<0>, + seq_exec, + statement::For<1, + seq_exec, + statement::For<0, seq_exec, statement::Lambda<0>>>>>>>( + make_tuple(RangeSegment{0, 25}, RangeSegment{0, 25}), make_tuple(TileSize{5}, TileSize{10}), - //make_tuple(TileSize(10)), // not sure we need this, good for static_assert - [=](int i, int j, TileSize x, TileSize y){ - std::cout << "Running index (" << i << "," << j << ") of " << x.size << "x" << y.size << " tile." << std::endl; - }); - + // make_tuple(TileSize(10)), // not sure we need this, good for + // static_assert + [=](int i, int j, TileSize x, TileSize y) { + std::cout << "Running index (" << i << "," << j << ") of " << x.size + << "x" << y.size << " tile." 
<< std::endl; + }); } diff --git a/examples/launch-param-reductions.cpp b/examples/launch-param-reductions.cpp index b57bedfd6b..2682e15edd 100644 --- a/examples/launch-param-reductions.cpp +++ b/examples/launch-param-reductions.cpp @@ -38,7 +38,7 @@ constexpr int HIP_BLOCK_SIZE = 256; #endif #if defined(RAJA_ENABLE_SYCL) -//LC testing hardware has a limit of 151 +// LC testing hardware has a limit of 151 constexpr int SYCL_BLOCK_SIZE = 128; #endif @@ -48,14 +48,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA reductions example...\n"; // _reductions_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 1000000; -// -// Use a resource to allocate memory -// + // + // Use a resource to allocate memory + // RAJA::resources::Host host_res; #if defined(RAJA_ENABLE_CUDA) RAJA::resources::Cuda device_res; @@ -68,22 +68,26 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. + // int* a = host_res.allocate(N); - for (int i = 0; i < N; ++i) { - if ( i % 2 == 0 ) { + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { a[i] = 1; - } else { + } + else + { a[i] = -1; } } -// -// Set min and max loc values -// + // + // Set min and max loc values + // constexpr int minloc_ref = N / 2; a[minloc_ref] = -100; @@ -91,37 +95,37 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) a[maxloc_ref] = 100; // _reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// - the sum will be zero -// - the min will be -100 -// - the max will be 100 -// - the min loc will be N/2 -// - the max loc will be N/2 + 1 -// -// - -// -// Define index range for iterating over a elements in all examples -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // - the sum will be zero + // - the min will be -100 + // - the max will be 100 + // - the min loc will be N/2 + // - the max loc will be N/2 + 1 + // + // + + // + // Define index range for iterating over a elements in all examples + // // _reductions_range_start RAJA::TypedRangeSegment arange(0, N); // _reductions_range_end -// -// Define ValLoc Type -// + // + // Define ValLoc Type + // using VALLOC_INT = RAJA::expt::ValLoc; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential reductions...\n"; // _reductions_raja_seq_start - using LAUNCH_POL1 = RAJA::LaunchPolicy; - using LOOP_POL1 = RAJA::LoopPolicy; + using LAUNCH_POL1 = RAJA::LaunchPolicy; + using LOOP_POL1 = RAJA::LoopPolicy; int seq_sum = 0; int seq_min = std::numeric_limits::max(); @@ -129,20 +133,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT seq_minloc(std::numeric_limits::max(), -1); VALLOC_INT seq_maxloc(std::numeric_limits::min(), -1); - RAJA::launch - (host_res, RAJA::LaunchParams(), "SeqReductionKernel", - RAJA::expt::Reduce(&seq_sum), - RAJA::expt::Reduce(&seq_min), - RAJA::expt::Reduce(&seq_max), - RAJA::expt::Reduce(&seq_minloc), - RAJA::expt::Reduce(&seq_maxloc), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, - int 
&_seq_sum, int &_seq_min, - int &_seq_max, VALLOC_INT &_seq_minloc, - VALLOC_INT &_seq_maxloc) { - - RAJA::loop(ctx, arange, [&] (int i) { - + RAJA::launch( + host_res, + RAJA::LaunchParams(), + "SeqReductionKernel", + RAJA::expt::Reduce(&seq_sum), + RAJA::expt::Reduce(&seq_min), + RAJA::expt::Reduce(&seq_max), + RAJA::expt::Reduce(&seq_minloc), + RAJA::expt::Reduce(&seq_maxloc), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, + int& _seq_sum, + int& _seq_min, + int& _seq_max, + VALLOC_INT& _seq_minloc, + VALLOC_INT& _seq_maxloc) { + RAJA::loop(ctx, arange, [&](int i) { _seq_sum += a[i]; _seq_min = RAJA_MIN(a[i], _seq_min); @@ -152,33 +158,31 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) _seq_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _seq_maxloc); //_seq_minloc.min(a[i], i); //_seq_maxloc.max(a[i], i); - // Note : RAJA::expt::ValLoc objects provide min() and max() methods - // that are equivalent to the assignments with RAJA_MIN and RAJA_MAX - // above. - } - ); - - } - ); + // Note : RAJA::expt::ValLoc objects provide min() and max() + // methods + // that are equivalent to the assignments with RAJA_MIN and + // RAJA_MAX above. + }); + }); std::cout << "\tsum = " << seq_sum << std::endl; std::cout << "\tmin = " << seq_min << std::endl; std::cout << "\tmax = " << seq_max << std::endl; std::cout << "\tmin, loc = " << seq_minloc.getVal() << " , " - << seq_minloc.getLoc() << std::endl; + << seq_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << seq_maxloc.getVal() << " , " - << seq_maxloc.getLoc() << std::endl; + << seq_maxloc.getLoc() << std::endl; // _reductions_raja_seq_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP reductions...\n"; // _reductions_raja_omppolicy_start - using LAUNCH_POL2 = RAJA::LaunchPolicy; - using LOOP_POL2 = RAJA::LoopPolicy; + using LAUNCH_POL2 = RAJA::LaunchPolicy; + using LOOP_POL2 = RAJA::LoopPolicy; // _reductions_raja_omppolicy_end int omp_sum = 0; @@ -187,20 +191,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT omp_minloc(std::numeric_limits::max(), -1); VALLOC_INT omp_maxloc(std::numeric_limits::min(), -1); - RAJA::launch - (host_res, RAJA::LaunchParams(), "OmpReductionKernel", - RAJA::expt::Reduce(&omp_sum), - RAJA::expt::Reduce(&omp_min), - RAJA::expt::Reduce(&omp_max), - RAJA::expt::Reduce(&omp_minloc), - RAJA::expt::Reduce(&omp_maxloc), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, - int &_omp_sum, int &_omp_min, - int &_omp_max, VALLOC_INT &_omp_minloc, - VALLOC_INT &_omp_maxloc) { - - RAJA::loop(ctx, arange, [&] (int i) { - + RAJA::launch( + host_res, + RAJA::LaunchParams(), + "OmpReductionKernel", + RAJA::expt::Reduce(&omp_sum), + RAJA::expt::Reduce(&omp_min), + RAJA::expt::Reduce(&omp_max), + RAJA::expt::Reduce(&omp_minloc), + RAJA::expt::Reduce(&omp_maxloc), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, + int& _omp_sum, + int& _omp_min, + int& _omp_max, + VALLOC_INT& _omp_minloc, + VALLOC_INT& _omp_maxloc) { + RAJA::loop(ctx, arange, [&](int i) { _omp_sum += a[i]; _omp_min = RAJA_MIN(a[i], _omp_min); @@ -210,23 +216,20 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) _omp_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _omp_maxloc); //_omp_minloc.min(a[i], i); //_omp_maxloc.max(a[i], i); - } - ); - - } - ); + }); + }); std::cout << "\tsum = " << omp_sum << std::endl; std::cout 
<< "\tmin = " << omp_min << std::endl; std::cout << "\tmax = " << omp_max << std::endl; std::cout << "\tmin, loc = " << omp_minloc.getVal() << " , " - << omp_minloc.getLoc() << std::endl; + << omp_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << omp_maxloc.getVal() << " , " - << omp_maxloc.getLoc() << std::endl; + << omp_maxloc.getLoc() << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA reductions...\n"; @@ -235,11 +238,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) device_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_cudapolicy_start - using LAUNCH_POL3 = RAJA::LaunchPolicy>; - using LOOP_POL3 = RAJA::LoopPolicy; + using LAUNCH_POL3 = RAJA::LaunchPolicy>; + using LOOP_POL3 = RAJA::LoopPolicy; // _reductions_raja_cudapolicy_end - const int NUMBER_OF_TEAMS = (N-1)/CUDA_BLOCK_SIZE + 1; + const int NUMBER_OF_TEAMS = (N - 1) / CUDA_BLOCK_SIZE + 1; int cuda_sum = 0; int cuda_min = std::numeric_limits::max(); @@ -247,21 +250,23 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT cuda_minloc(std::numeric_limits::max(), -1); VALLOC_INT cuda_maxloc(std::numeric_limits::min(), -1); - RAJA::launch - (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(CUDA_BLOCK_SIZE)), - "CUDAReductionKernel", - RAJA::expt::Reduce(&cuda_sum), - RAJA::expt::Reduce(&cuda_min), - RAJA::expt::Reduce(&cuda_max), - RAJA::expt::Reduce(&cuda_minloc), - RAJA::expt::Reduce(&cuda_maxloc), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, - int &_cuda_sum, int &_cuda_min, int &_cuda_max, - VALLOC_INT &_cuda_minloc, VALLOC_INT &_cuda_maxloc) { - - - RAJA::loop(ctx, arange, [&] (int i) { - + RAJA::launch( + device_res, + RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), + RAJA::Threads(CUDA_BLOCK_SIZE)), + "CUDAReductionKernel", + RAJA::expt::Reduce(&cuda_sum), + RAJA::expt::Reduce(&cuda_min), + RAJA::expt::Reduce(&cuda_max), + RAJA::expt::Reduce(&cuda_minloc), + RAJA::expt::Reduce(&cuda_maxloc), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, + int& _cuda_sum, + int& _cuda_min, + int& _cuda_max, + VALLOC_INT& _cuda_minloc, + VALLOC_INT& _cuda_maxloc) { + RAJA::loop(ctx, arange, [&](int i) { _cuda_sum += d_a[i]; _cuda_min = RAJA_MIN(d_a[i], _cuda_min); @@ -271,26 +276,21 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) _cuda_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _cuda_maxloc); //_cuda_minloc.min(a[i], i); //_cuda_maxloc.max(a[i], i); - - } - ); - - - } - ); + }); + }); std::cout << "\tsum = " << cuda_sum << std::endl; std::cout << "\tmin = " << cuda_min << std::endl; std::cout << "\tmax = " << cuda_max << std::endl; std::cout << "\tmin, loc = " << cuda_minloc.getVal() << " , " - << cuda_minloc.getLoc() << std::endl; + << cuda_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << cuda_maxloc.getVal() << " , " - << cuda_maxloc.getLoc() << std::endl; + << cuda_maxloc.getLoc() << std::endl; device_res.deallocate(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP reductions...\n"; @@ -299,11 +299,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) device_res.memcpy(d_a, a, sizeof(int) * N); // 
_reductions_raja_hippolicy_start - using LAUNCH_POL3 = RAJA::LaunchPolicy>; - using LOOP_POL3 = RAJA::LoopPolicy; + using LAUNCH_POL3 = RAJA::LaunchPolicy>; + using LOOP_POL3 = RAJA::LoopPolicy; // _reductions_raja_hippolicy_end - const int NUMBER_OF_TEAMS = (N-1)/HIP_BLOCK_SIZE + 1; + const int NUMBER_OF_TEAMS = (N - 1) / HIP_BLOCK_SIZE + 1; int hip_sum = 0; int hip_min = std::numeric_limits::max(); @@ -311,21 +311,23 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT hip_minloc(std::numeric_limits::max(), -1); VALLOC_INT hip_maxloc(std::numeric_limits::min(), -1); - RAJA::launch - (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(HIP_BLOCK_SIZE)), - "HipReductionKernel", - RAJA::expt::Reduce(&hip_sum), - RAJA::expt::Reduce(&hip_min), - RAJA::expt::Reduce(&hip_max), - RAJA::expt::Reduce(&hip_minloc), - RAJA::expt::Reduce(&hip_maxloc), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, - int &_hip_sum, int &_hip_min, - int &_hip_max, VALLOC_INT &_hip_minloc, - VALLOC_INT &_hip_maxloc) { - - RAJA::loop(ctx, arange, [&] (int i) { - + RAJA::launch( + device_res, + RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), + RAJA::Threads(HIP_BLOCK_SIZE)), + "HipReductionKernel", + RAJA::expt::Reduce(&hip_sum), + RAJA::expt::Reduce(&hip_min), + RAJA::expt::Reduce(&hip_max), + RAJA::expt::Reduce(&hip_minloc), + RAJA::expt::Reduce(&hip_maxloc), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, + int& _hip_sum, + int& _hip_min, + int& _hip_max, + VALLOC_INT& _hip_minloc, + VALLOC_INT& _hip_maxloc) { + RAJA::loop(ctx, arange, [&](int i) { _hip_sum += d_a[i]; _hip_min = RAJA_MIN(d_a[i], _hip_min); @@ -335,25 +337,21 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) _hip_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _hip_maxloc); //_hip_minloc.min(d_a[i], i); //_hip_maxloc.max(d_a[i], i); - - } - ); - - } - ); + }); + }); std::cout << "\tsum = " << hip_sum << std::endl; std::cout << "\tmin = " << hip_min << std::endl; std::cout << "\tmax = " << hip_max << std::endl; std::cout << "\tmin, loc = " << hip_minloc.getVal() << " , " - << hip_minloc.getLoc() << std::endl; + << hip_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << hip_maxloc.getVal() << " , " - << hip_maxloc.getLoc() << std::endl; + << hip_maxloc.getLoc() << std::endl; device_res.deallocate(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) std::cout << "\n Running RAJA SYCL reductions...\n"; @@ -362,11 +360,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) device_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_syclpolicy_start - using LAUNCH_POL4 = RAJA::LaunchPolicy>; - using LOOP_POL4 = RAJA::LoopPolicy; + using LAUNCH_POL4 = RAJA::LaunchPolicy>; + using LOOP_POL4 = RAJA::LoopPolicy; // _reductions_raja_syclpolicy_end - const int NUMBER_OF_TEAMS = (N-1)/SYCL_BLOCK_SIZE + 1; + const int NUMBER_OF_TEAMS = (N - 1) / SYCL_BLOCK_SIZE + 1; int sycl_sum = 0; int sycl_min = std::numeric_limits::max(); @@ -374,21 +372,23 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT sycl_minloc(std::numeric_limits::max(), -1); VALLOC_INT sycl_maxloc(std::numeric_limits::min(), -1); - RAJA::launch - (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(SYCL_BLOCK_SIZE)), - "SyclReductionKernel", - RAJA::expt::Reduce(&sycl_sum), - 
RAJA::expt::Reduce(&sycl_min), - RAJA::expt::Reduce(&sycl_max), - RAJA::expt::Reduce(&sycl_minloc), - RAJA::expt::Reduce(&sycl_maxloc), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, - int &_sycl_sum, int &_sycl_min, - int &_sycl_max, VALLOC_INT &_sycl_minloc, - VALLOC_INT &_sycl_maxloc) { - - RAJA::loop(ctx, arange, [&] (int i) { - + RAJA::launch( + device_res, + RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), + RAJA::Threads(SYCL_BLOCK_SIZE)), + "SyclReductionKernel", + RAJA::expt::Reduce(&sycl_sum), + RAJA::expt::Reduce(&sycl_min), + RAJA::expt::Reduce(&sycl_max), + RAJA::expt::Reduce(&sycl_minloc), + RAJA::expt::Reduce(&sycl_maxloc), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, + int& _sycl_sum, + int& _sycl_min, + int& _sycl_max, + VALLOC_INT& _sycl_minloc, + VALLOC_INT& _sycl_maxloc) { + RAJA::loop(ctx, arange, [&](int i) { _sycl_sum += d_a[i]; _sycl_min = RAJA_MIN(d_a[i], _sycl_min); @@ -398,29 +398,25 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) _sycl_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _sycl_maxloc); //_sycl_minloc.min(d_a[i], i); //_sycl_maxloc.max(d_a[i], i); - - } - ); - - } - ); + }); + }); std::cout << "\tsum = " << sycl_sum << std::endl; std::cout << "\tmin = " << sycl_min << std::endl; std::cout << "\tmax = " << sycl_max << std::endl; std::cout << "\tmin, loc = " << sycl_minloc.getVal() << " , " - << sycl_minloc.getLoc() << std::endl; + << sycl_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << sycl_maxloc.getVal() << " , " - << sycl_maxloc.getLoc() << std::endl; + << sycl_maxloc.getLoc() << std::endl; device_res.deallocate(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // host_res.deallocate(a); std::cout << "\n DONE!...\n"; diff --git a/examples/launch_flatten.cpp b/examples/launch_flatten.cpp index 2a3d92ad84..b79cc249a4 100644 --- a/examples/launch_flatten.cpp +++ b/examples/launch_flatten.cpp @@ -35,15 +35,17 @@ #if defined(RAJA_ENABLE_CUDA) using device_launch = RAJA::LaunchPolicy>; -using device_inner_pol0 = RAJA::LoopPolicy; -using device_inner_pol1 = RAJA::LoopPolicy; -using device_flatten_pol = RAJA::LoopPolicy; +using device_inner_pol0 = RAJA::LoopPolicy; +using device_inner_pol1 = RAJA::LoopPolicy; +using device_flatten_pol = + RAJA::LoopPolicy; using reduce_policy = RAJA::cuda_reduce; #elif defined(RAJA_ENABLE_HIP) using device_launch = RAJA::LaunchPolicy>; -using device_inner_pol0 = RAJA::LoopPolicy; -using device_inner_pol1 = RAJA::LoopPolicy; -using device_flatten_pol = RAJA::LoopPolicy; +using device_inner_pol0 = RAJA::LoopPolicy; +using device_inner_pol1 = RAJA::LoopPolicy; +using device_flatten_pol = + RAJA::LoopPolicy; using reduce_policy = RAJA::hip_reduce; #endif @@ -54,7 +56,7 @@ using reduce_policy = RAJA::hip_reduce; using host_launch = RAJA::LaunchPolicy; using host_loop = RAJA::LoopPolicy; -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) @@ -63,20 +65,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Problem size dimensions // constexpr int N = 4; - constexpr int NN = N*N; + constexpr int NN = N * N; // // Configure grid size // - RAJA::LaunchParams launch_params(RAJA::Teams(1), - RAJA::Threads(N, N)); + RAJA::LaunchParams launch_params(RAJA::Teams(1), RAJA::Threads(N, N)); // // Resource object for host, used to allocate memory // camp::resources::Host host_res; - int *h_A_ptr = host_res.allocate(NN); + int* h_A_ptr = host_res.allocate(NN); // // Resource object for device, used to allocate memory @@ -87,9 +88,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) camp::resources::Hip device_res; #endif - int *d_A_ptr = device_res.allocate(NN); + int* d_A_ptr = device_res.allocate(NN); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running device version of teams_flatten example ...\n"; @@ -97,27 +98,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_A_2DView(d_A_ptr, N, N); RAJA::View> d_A_1DView(d_A_ptr, NN); - RAJA::launch - (launch_params, [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) - { + RAJA::launch( + launch_params, [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::RangeSegment(0, N), [&](int j) { + RAJA::loop( + ctx, RAJA::RangeSegment(0, N), [&](int i) { + d_A_2DView(j, i) = i + j; + }); + }); - RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int j) { - RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int i) { - d_A_2DView(j, i) = i + j; - }); - }); + ctx.teamSync(); - ctx.teamSync(); + // RAJA flatten policy will reshape a 2/3D thread team to 1D simplifying + // accumulating memory contents + RAJA::loop( + ctx, RAJA::RangeSegment(0, NN), [&](int i) { + device_kernel_sum += d_A_1DView(i); + }); + }); - // RAJA flatten policy will reshape a 2/3D thread team to 1D simplifying - // accumulating memory contents - RAJA::loop(ctx, RAJA::RangeSegment(0, NN), [&] (int i) { - device_kernel_sum += d_A_1DView(i); - }); - - }); - 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running host version of teams_flatten example ...\n"; @@ -125,29 +126,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> h_A_2DView(h_A_ptr, N, N); RAJA::View> h_A_1DView(h_A_ptr, NN); - RAJA::launch - (launch_params, [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) - { - - RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int j) { - RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int i) { - h_A_2DView(j, i) = i + j; - }); - }); - - ctx.teamSync(); - - //As loops are dispatched as standard C loops we can revert to using - //a regular seq_exec policy - RAJA::loop(ctx, RAJA::RangeSegment(0, NN), [&] (int i) { - host_kernel_sum += h_A_1DView(i); - }); - - }); - - if ( device_kernel_sum.get() == host_kernel_sum.get() ) { + RAJA::launch( + launch_params, [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&](int j) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&](int i) { + h_A_2DView(j, i) = i + j; + }); + }); + + ctx.teamSync(); + + // As loops are dispatched as standard C loops we can revert to using + // a regular seq_exec policy + RAJA::loop(ctx, RAJA::RangeSegment(0, NN), [&](int i) { + host_kernel_sum += h_A_1DView(i); + }); + }); + + if (device_kernel_sum.get() == host_kernel_sum.get()) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } diff --git a/examples/launch_matrix-multiply.cpp b/examples/launch_matrix-multiply.cpp index 797c5ee7c5..7c00a71071 100644 --- a/examples/launch_matrix-multiply.cpp +++ b/examples/launch_matrix-multiply.cpp @@ -31,23 +31,22 @@ /* * Define number of threads in x and y dimensions in a RAJA thread team * or in a CUDA/HIP thread blocks -*/ + */ #define THREAD_SZ 16 /* * Define host/device launch policies */ -using launch_policy = RAJA::LaunchPolicy< - RAJA::seq_launch_t +using launch_policy = RAJA::LaunchPolicy + , + RAJA::cuda_launch_t #endif #if defined(RAJA_ENABLE_HIP) - , - RAJA::hip_launch_t + , + RAJA::hip_launch_t #endif - >; + >; using loop_policy = RAJA::seq_exec; @@ -77,39 +76,45 @@ using gpu_global_thread_xy_policy = RAJA::hip_global_thread_xy; */ using teams_x = RAJA::LoopPolicy; + >; using teams_y = RAJA::LoopPolicy; + >; using threads_x = RAJA::LoopPolicy; + >; using threads_y = RAJA::LoopPolicy; + >; using global_thread_x = RAJA::LoopPolicy; + >; using global_thread_y = RAJA::LoopPolicy; + >; // // Define dimensionality of matrices. 
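For readers skimming this hunk, a minimal sketch of how the paired host/device loop-policy aliases defined above are consumed: RAJA::launch selects the host or device member of each alias at run time. The vector-add function below is an illustrative placeholder, not one of the kernels in this example; launch_policy, global_thread_x, and THREAD_SZ are the names defined in this file.

void vector_add(RAJA::ExecPlace place, int N, const double* x, double* y)
{
  const int n_teams = (N - 1) / THREAD_SZ + 1;

  RAJA::launch<launch_policy>(
      place,
      RAJA::LaunchParams(RAJA::Teams(n_teams), RAJA::Threads(THREAD_SZ)),
      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
        // global_thread_x resolves to seq_exec on the host and to a
        // one-thread-per-index GPU mapping on the device.
        RAJA::loop<global_thread_x>(ctx, RAJA::RangeSegment(0, N),
                                    [&](int i) { y[i] += x[i]; });
      });
}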
@@ -134,9 +139,11 @@ __global__ void matMultKernel(int N, double* C, double* A, double* B) int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; - if ( row < N && col < N ) { + if (row < N && col < N) + { double dot = 0.0; - for (int k = 0; k < N; ++k) { + for (int k = 0; k < N; ++k) + { dot += A(row, k) * B(k, col); } @@ -147,8 +154,8 @@ __global__ void matMultKernel(int N, double* C, double* A, double* B) __global__ void sharedMatMultKernel(int N, double* C, double* A, double* B) { - int Row = blockIdx.y*THREAD_SZ + threadIdx.y; - int Col = blockIdx.x*THREAD_SZ + threadIdx.x; + int Row = blockIdx.y * THREAD_SZ + threadIdx.y; + int Col = blockIdx.x * THREAD_SZ + threadIdx.x; __shared__ double As[THREAD_SZ][THREAD_SZ]; __shared__ double Bs[THREAD_SZ][THREAD_SZ]; @@ -156,15 +163,16 @@ __global__ void sharedMatMultKernel(int N, double* C, double* A, double* B) Cs[threadIdx.y][threadIdx.x] = 0.0; - for (int k = 0; k < (THREAD_SZ + N - 1)/THREAD_SZ; k++) { + for (int k = 0; k < (THREAD_SZ + N - 1) / THREAD_SZ; k++) + { - if ( static_cast(k*THREAD_SZ + threadIdx.x) < N && Row < N ) - As[threadIdx.y][threadIdx.x] = A[Row*N + k*THREAD_SZ + threadIdx.x]; + if (static_cast(k * THREAD_SZ + threadIdx.x) < N && Row < N) + As[threadIdx.y][threadIdx.x] = A[Row * N + k * THREAD_SZ + threadIdx.x]; else As[threadIdx.y][threadIdx.x] = 0.0; - if ( static_cast(k*THREAD_SZ + threadIdx.y) < N && Col < N) - Bs[threadIdx.y][threadIdx.x] = B[(k*THREAD_SZ + threadIdx.y)*N + Col]; + if (static_cast(k * THREAD_SZ + threadIdx.y) < N && Col < N) + Bs[threadIdx.y][threadIdx.x] = B[(k * THREAD_SZ + threadIdx.y) * N + Col]; else Bs[threadIdx.y][threadIdx.x] = 0.0; @@ -177,8 +185,8 @@ __global__ void sharedMatMultKernel(int N, double* C, double* A, double* B) } if (Row < N && Col < N) - C[((blockIdx.y * blockDim.y + threadIdx.y)*N) + - (blockIdx.x * blockDim.x)+ threadIdx.x] = Cs[threadIdx.y][threadIdx.x]; + C[((blockIdx.y * blockDim.y + threadIdx.y) * N) + + (blockIdx.x * blockDim.x) + threadIdx.x] = Cs[threadIdx.y][threadIdx.x]; } #endif @@ -186,7 +194,7 @@ __global__ void sharedMatMultKernel(int N, double* C, double* A, double* B) // Functions for checking results // template -void checkResult(T *C, int N); +void checkResult(T* C, int N); template void checkResult(RAJA::View> Cview, int N); @@ -195,68 +203,72 @@ void checkResult(RAJA::View> Cview, int N); // Functions for printing results // template -void printResult(T *C, int N); +void printResult(T* C, int N); template void printResult(RAJA::View> Cview, int N); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix multiplication example...\n"; -// -// Define num rows/cols in matrix and number of teams based on -// number of threads in a dimension. -// + // + // Define num rows/cols in matrix and number of teams based on + // number of threads in a dimension. + // const int N = 1000; - const int NTeams = (N - 1)/THREAD_SZ + 1; + const int NTeams = (N - 1) / THREAD_SZ + 1; -// -// Allocate and initialize matrix data. -// - double *A = memoryManager::allocate(N * N); - double *B = memoryManager::allocate(N * N); - double *C = memoryManager::allocate(N * N); - - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { + // + // Allocate and initialize matrix data. 
+ // + double* A = memoryManager::allocate(N * N); + double* B = memoryManager::allocate(N * N); + double* C = memoryManager::allocate(N * N); + + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { A(row, col) = row; B(row, col) = col; } } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of matrix multiplication...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_cstyle_start - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { double dot = 0.0; - for (int k = 0; k < N; ++k) { + for (int k = 0; k < N; ++k) + { dot += A(row, k) * B(k, col); } C(row, col) = dot; - } } // _matmult_cstyle_end checkResult(C, N); -//printResult(C, N); + // printResult(C, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// We define RAJA range segments to define the ranges of -// row, column, and dot-product loops for RAJA variants -// + // + // We define RAJA range segments to define the ranges of + // row, column, and dot-product loops for RAJA variants + // // _matmult_ranges_start RAJA::RangeSegment row_range(0, N); RAJA::RangeSegment col_range(0, N); @@ -265,120 +277,120 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif // _matmult_ranges_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// For the RAJA implementations of matrix multiplication, we -// use RAJA 'View' objects to access the matrix data. A RAJA view -// holds a pointer to a data array and enables multi-dimensional indexing -// into that data, similar to the macros we defined above. -// + // + // For the RAJA implementations of matrix multiplication, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into that data, similar to the macros we defined above. + // // _matmult_views_start RAJA::View> Aview(A, N, N); RAJA::View> Bview(B, N, N); RAJA::View> Cview(C, N, N); // _matmult_views_end -//----------------------------------------------------------------------------// -// -// RAJA Team loops uses a RAJA::launch method to launch a kernel. -// These examples, illustrate the basic interface and mechanics. -// -// This is different than RAJA::forall and so a few points of exmplanation -// are in order: -// -// 1) RAJA Team loops execute inside a RAJA execution space (RAJA::launch) -// execution is chosen at run time and we support running on the host -// or device. -// -// 2) RAJA Team loops follows the thread/block programming models of CUDA/HIP -// and considers programming using a group of threads in which we group into -// teams. Number of threads and teams are defined inside the Resources struct. -// -// 3) Launch context is used synchronize threads within a team, an example of this -// is presented further below. -// -// 4) Parallelism is expressed through RAJA loops. Hierarchical parallelism can be -// expressed by mapping outer loops (up to 3) to gpu blocks (teams) and inner -// loops to threads in a block (team). 
-// + //----------------------------------------------------------------------------// + // + // RAJA Team loops uses a RAJA::launch method to launch a kernel. + // These examples, illustrate the basic interface and mechanics. + // + // This is different than RAJA::forall and so a few points of exmplanation + // are in order: + // + // 1) RAJA Team loops execute inside a RAJA execution space (RAJA::launch) + // execution is chosen at run time and we support running on the host + // or device. + // + // 2) RAJA Team loops follows the thread/block programming models of CUDA/HIP + // and considers programming using a group of threads in which we group + // into teams. Number of threads and teams are defined inside the Resources + // struct. + // + // 3) Launch context is used synchronize threads within a team, an example of + // this + // is presented further below. + // + // 4) Parallelism is expressed through RAJA loops. Hierarchical parallelism + // can be + // expressed by mapping outer loops (up to 3) to gpu blocks (teams) and + // inner loops to threads in a block (team). + // std::cout << "\n Running sequential mat-mult (RAJA-nested)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); - //As a starting point we demonstrate assigning each dot product - //to a thread on a two dimensional compute grid. Rows are mapped - //to threads in the x dimension, while Cols are mapped to threads - //in the y dimension. On the host this mapping simplifies to executing - //two for loops. + // As a starting point we demonstrate assigning each dot product + // to a thread on a two dimensional compute grid. Rows are mapped + // to threads in the x dimension, while Cols are mapped to threads + // in the y dimension. On the host this mapping simplifies to executing + // two for loops. // _matmult_basickernel_start - RAJA::launch(RAJA::ExecPlace::HOST, - RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), - RAJA::Threads(THREAD_SZ,THREAD_SZ)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, col_range, [&] (int col) { - RAJA::loop(ctx, row_range, [&] (int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; + RAJA::launch( + RAJA::ExecPlace::HOST, + RAJA::LaunchParams(RAJA::Teams(NTeams, NTeams), + RAJA::Threads(THREAD_SZ, THREAD_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, col_range, [&](int col) { + RAJA::loop(ctx, row_range, [&](int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); + }); }); - }); - - }); // _matmult_basickernel_end checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running OpenMP mat-mult (RAJA-nested - omp outer)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); - //RAJA Team loops currently only support a pair of policies at a time. - //Switching between a sequential and OpenMP launch space requires - //recompiling execution policies. When running exclusively on the host - //the compute grid may be left uninitialized as loop methods get expanded to - //standard C style loops. + // RAJA Team loops currently only support a pair of policies at a time. 
+ // Switching between a sequential and OpenMP launch space requires + // recompiling execution policies. When running exclusively on the host + // the compute grid may be left uninitialized as loop methods get expanded to + // standard C style loops. using omp_launch_policy = RAJA::LaunchPolicy; using omp_col_policy0 = RAJA::LoopPolicy; using omp_row_policy0 = RAJA::LoopPolicy; - RAJA::launch(RAJA::LaunchParams(), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, col_range, [&] (int col) { - RAJA::loop(ctx, row_range, [&] (int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; + RAJA::launch( + RAJA::LaunchParams(), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, col_range, [&](int col) { + RAJA::loop(ctx, row_range, [&](int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); + }); }); - }); - - }); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP mat-mult (RAJA-nested - collapse)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This example collapses the row and col loops in an OpenMP parallel region. @@ -387,33 +399,32 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using global_thread_xy = RAJA::LoopPolicy; - RAJA::launch(RAJA::ExecPlace::HOST, - RAJA::LaunchParams(), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::expt::loop(ctx, col_range, row_range, [&] (int col, int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - - }); - - }); + RAJA::launch( + RAJA::ExecPlace::HOST, + RAJA::LaunchParams(), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::expt::loop( + ctx, col_range, row_range, [&](int col, int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); + }); checkResult(Cview, N); -//printResult(Cview, N); +// printResult(Cview, N); #endif // if RAJA_ENABLE_OPENMP -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running CUDA mat-mult (RAJA-nested)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This example maps row indicies to RAJA teams (CUDA @@ -425,87 +436,86 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // and col = threadIdx.x in the kernel. 
// // - RAJA::launch(RAJA::ExecPlace::DEVICE, - RAJA::LaunchParams(RAJA::Teams(N), - RAJA::Threads(N)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, col_range, [&] (int col) { - RAJA::loop(ctx, row_range, [&] (int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - }); - }); - - }); + RAJA::launch( + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(N), RAJA::Threads(N)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, col_range, [&](int col) { + RAJA::loop(ctx, row_range, [&](int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); + }); + }); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA tiled mat-mult ...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This example takes the extents of the col and row loops and breaks // them down into `tiles`. Tile loops are used to generate RangeSegments of // fixed size, THREAD_SZ in this case. RAJA loops are then used to iterate - // across the work within each tile. On the device, tiles are typically assigned - // to teams, while RAJA loops are mapped to threads. + // across the work within each tile. On the device, tiles are typically + // assigned to teams, while RAJA loops are mapped to threads. // // The tiling capabilities in RAJA will also mask out of bounds iterations. // - RAJA::launch(RAJA::ExecPlace::DEVICE, - RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), - RAJA::Threads(THREAD_SZ,THREAD_SZ)), + RAJA::launch( + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(NTeams, NTeams), + RAJA::Threads(THREAD_SZ, THREAD_SZ)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::tile - (ctx, THREAD_SZ, row_range, [&] (RAJA::RangeSegment const &row_tile) { - RAJA::tile - (ctx, THREAD_SZ, col_range, [&] (RAJA::RangeSegment const &col_tile) { - - RAJA::loop(ctx, row_tile, [&] (int col) { - RAJA::loop(ctx, col_tile, [&] (int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - + RAJA::tile( + ctx, THREAD_SZ, row_range, [&](RAJA::RangeSegment const& row_tile) { + RAJA::tile( + ctx, + THREAD_SZ, + col_range, + [&](RAJA::RangeSegment const& col_tile) { + RAJA::loop(ctx, row_tile, [&](int col) { + RAJA::loop(ctx, col_tile, [&](int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); + }); }); - }); }); - }); - }); + }); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); #endif // if RAJA_ENABLE_CUDA -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) - double *d_A = memoryManager::allocate_gpu(N * N); - double *d_B = memoryManager::allocate_gpu(N * N); - double *d_C = memoryManager::allocate_gpu(N * N); + double* d_A = memoryManager::allocate_gpu(N * N); + double* d_B = memoryManager::allocate_gpu(N * N); + double* d_C = memoryManager::allocate_gpu(N * N); std::cout << "\n Running HIP mat-mult (RAJA-nested - POL4)...\n"; - std::memset(C, 
0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); - hipErrchk(hipMemcpy( d_A, A, N * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B, B, N * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_B, B, N * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); RAJA::View> d_Aview(d_A, N, N); RAJA::View> d_Bview(d_B, N, N); @@ -521,74 +531,73 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // and col = threadIdx.x in the kernel. // // - RAJA::launch(RAJA::ExecPlace::DEVICE, - RAJA::LaunchParams(RAJA::Teams(N), - RAJA::Threads(N)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, col_range, [&] (int col) { - RAJA::loop(ctx, row_range, [&] (int row) { - + RAJA::launch( + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(N), RAJA::Threads(N)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, col_range, [&](int col) { + RAJA::loop(ctx, row_range, [&](int row) { double dot = 0.0; - for (int k = 0; k < N; ++k) { + for (int k = 0; k < N; ++k) + { dot += d_Aview(row, k) * d_Bview(k, col); } d_Cview(row, col) = dot; - + }); }); - }); - }); + }); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP tiled mat-mult ...\n"; - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // // This example takes the extents of the col and row loops and breaks // them down into `tiles`. Tile loops are used to generate RangeSegments of // fixed size, THREAD_SZ in this case. RAJA loops are then used to iterate - // across the work within each tile. On the device tiles are typically assigned - // to teams, while RAJA loops are mapped to threads. + // across the work within each tile. On the device tiles are typically + // assigned to teams, while RAJA loops are mapped to threads. // // The tiling capabilities in RAJA will also mask out of bounds iterations. 
// - RAJA::launch(RAJA::ExecPlace::DEVICE, - RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), - RAJA::Threads(THREAD_SZ,THREAD_SZ)), + RAJA::launch( + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(NTeams, NTeams), + RAJA::Threads(THREAD_SZ, THREAD_SZ)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::tile - (ctx, THREAD_SZ, row_range, [&] (RAJA::RangeSegment const &row_tile) { - RAJA::tile - (ctx, THREAD_SZ, col_range, [&] (RAJA::RangeSegment const &col_tile) { - - RAJA::loop(ctx, row_tile, [&] (int col) { - RAJA::loop(ctx, col_tile, [&] (int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - + RAJA::tile( + ctx, THREAD_SZ, row_range, [&](RAJA::RangeSegment const& row_tile) { + RAJA::tile( + ctx, + THREAD_SZ, + col_range, + [&](RAJA::RangeSegment const& col_tile) { + RAJA::loop(ctx, row_tile, [&](int col) { + RAJA::loop(ctx, col_tile, [&](int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); + }); }); - }); }); - }); - }); + }); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); +// printResult(Cview, N); #endif // if RAJA_ENABLE_HIP //----------------------------------------------------------------------------// @@ -596,9 +605,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running CUDA tiled mat-mult with shared memory ...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); - using seq_loop = RAJA::LoopPolicy; + using seq_loop = RAJA::LoopPolicy; // // This example builds on the RAJA tiling capabilies presented earlier @@ -610,85 +619,99 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This example also uses the teamSync() method in the launch context // to add a barrier ensuring all threads have loaded/read from shared memory // - RAJA::launch(RAJA::ExecPlace::DEVICE, - RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), - RAJA::Threads(THREAD_SZ,THREAD_SZ)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - // - // Loop over teams - // - RAJA::tile - (ctx, THREAD_SZ, row_range, [&] (RAJA::RangeSegment const &y_tile) { - RAJA::tile - (ctx, THREAD_SZ, col_range, [&] (RAJA::RangeSegment const &x_tile) { - - RAJA_TEAM_SHARED double As[THREAD_SZ][THREAD_SZ]; - RAJA_TEAM_SHARED double Bs[THREAD_SZ][THREAD_SZ]; - RAJA_TEAM_SHARED double Cs[THREAD_SZ][THREAD_SZ]; - - RAJA::loop_icount(ctx, y_tile, [&](int row, int ty) { - RAJA::loop_icount(ctx, x_tile, [&](int col, int tx) { - Cs[ty][tx] = 0.0; - }); - }); - - RAJA::tile - (ctx, THREAD_SZ, dot_range, [&] (RAJA::RangeSegment const &k_tile) { - - RAJA::loop_icount(ctx, y_tile, [&](int row, int ty) { - RAJA::loop_icount(ctx, k_tile, [&](int k_id, int tx) { - As[ty][tx] = Aview(row,k_id); - }); - }); - - RAJA::loop_icount(ctx, k_tile, [&](int k_id, int ty) { - RAJA::loop_icount(ctx, x_tile, [&](int col, int tx) { - Bs[ty][tx] = Bview(k_id,col); - }); - }); - - ctx.teamSync(); - - RAJA::loop_icount(ctx, y_tile, [&](int row, int ty) { - RAJA::loop_icount(ctx, x_tile, [&](int col, int tx) { - - RAJA::loop_icount(ctx, k_tile, [&] (int gid, int e) { - Cs[ty][tx] += As[ty][e] * Bs[e][tx]; - }); - - }); - }); - - ctx.teamSync(); - - }); // slide across matrix - - RAJA::loop_icount(ctx, y_tile, [&](int row, int ty) 
{ - RAJA::loop_icount(ctx, x_tile, [&](int col, int tx) { - Cview(col,row) = Cs[ty][tx]; - }); - }); - }); - }); - }); // kernel + RAJA::launch( + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(NTeams, NTeams), + RAJA::Threads(THREAD_SZ, THREAD_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + // + // Loop over teams + // + RAJA::tile( + ctx, THREAD_SZ, row_range, [&](RAJA::RangeSegment const& y_tile) { + RAJA::tile( + ctx, + THREAD_SZ, + col_range, + [&](RAJA::RangeSegment const& x_tile) { + RAJA_TEAM_SHARED double As[THREAD_SZ][THREAD_SZ]; + RAJA_TEAM_SHARED double Bs[THREAD_SZ][THREAD_SZ]; + RAJA_TEAM_SHARED double Cs[THREAD_SZ][THREAD_SZ]; + + RAJA::loop_icount( + ctx, y_tile, [&](int row, int ty) { + RAJA::loop_icount( + ctx, x_tile, [&](int col, int tx) { + Cs[ty][tx] = 0.0; + }); + }); + + RAJA::tile( + ctx, + THREAD_SZ, + dot_range, + [&](RAJA::RangeSegment const& k_tile) { + RAJA::loop_icount( + ctx, y_tile, [&](int row, int ty) { + RAJA::loop_icount( + ctx, k_tile, [&](int k_id, int tx) { + As[ty][tx] = Aview(row, k_id); + }); + }); + + RAJA::loop_icount( + ctx, k_tile, [&](int k_id, int ty) { + RAJA::loop_icount( + ctx, x_tile, [&](int col, int tx) { + Bs[ty][tx] = Bview(k_id, col); + }); + }); + + ctx.teamSync(); + + RAJA::loop_icount( + ctx, y_tile, [&](int row, int ty) { + RAJA::loop_icount( + ctx, x_tile, [&](int col, int tx) { + RAJA::loop_icount( + ctx, k_tile, [&](int gid, int e) { + Cs[ty][tx] += As[ty][e] * Bs[e][tx]; + }); + }); + }); + + ctx.teamSync(); + }); // slide across matrix + + RAJA::loop_icount( + ctx, y_tile, [&](int row, int ty) { + RAJA::loop_icount( + ctx, x_tile, [&](int col, int tx) { + Cview(col, row) = Cs[ty][tx]; + }); + }); + }); + }); + }); // kernel checkResult(Cview, N); -//printResult(Cview, N); +// printResult(Cview, N); #endif //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running CUDA tiled mat-mult (no RAJA)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // Define thread block dimensions dim3 blockdim(THREAD_SZ, THREAD_SZ); // Define grid dimensions to match the RAJA version above - dim3 griddim(RAJA_DIVIDE_CEILING_INT(N,blockdim.x), - RAJA_DIVIDE_CEILING_INT(N,blockdim.y)); + dim3 griddim(RAJA_DIVIDE_CEILING_INT(N, blockdim.x), + RAJA_DIVIDE_CEILING_INT(N, blockdim.y)); -//printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, (int)griddim.y, (int)blockdim.x, (int)blockdim.y); + // printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, + // (int)griddim.y, (int)blockdim.x, (int)blockdim.y); // Launch CUDA kernel defined near the top of this file. 
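  // For the sizes used in this example (N = 1000, THREAD_SZ = 16) this gives
  // griddim = (63, 63) and blockdim = (16, 16), since ceil(1000 / 16) = 63;
  // the row/col bounds check inside matMultKernel masks the threads of the
  // last block that fall outside the matrix.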
matMultKernel<<>>(N, C, A, B); @@ -697,20 +720,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(Cview, N); - std::cout << "\n Running CUDA tiled mat-mult with shared memory (no RAJA)...\n"; + std::cout << "\n Running CUDA tiled mat-mult with shared memory (no " + "RAJA)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); sharedMatMultKernel<<>>(N, C, A, B); cudaDeviceSynchronize(); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); #endif // if RAJA_ENABLE_CUDA -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) @@ -719,47 +743,58 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Define thread block dimensions dim3 blockdim(THREAD_SZ, THREAD_SZ); // Define grid dimensions to match the RAJA version above - dim3 griddim(RAJA_DIVIDE_CEILING_INT(N,blockdim.x), - RAJA_DIVIDE_CEILING_INT(N,blockdim.y)); + dim3 griddim(RAJA_DIVIDE_CEILING_INT(N, blockdim.x), + RAJA_DIVIDE_CEILING_INT(N, blockdim.y)); -//printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, (int)griddim.y, (int)blockdim.x, (int)blockdim.y); + // printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, + // (int)griddim.y, (int)blockdim.x, (int)blockdim.y); - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // Launch HIP kernel defined near the top of this file. - hipLaunchKernelGGL((matMultKernel), dim3(griddim), dim3(blockdim), 0, 0, N, d_C, d_A, d_B); + hipLaunchKernelGGL( + (matMultKernel), dim3(griddim), dim3(blockdim), 0, 0, N, d_C, d_A, d_B); hipDeviceSynchronize(); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); - std::cout << "\n Running HIP tiled mat-mult with shared memory (no RAJA)...\n"; + std::cout << "\n Running HIP tiled mat-mult with shared memory (no " + "RAJA)...\n"; - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // Launch HIP kernel defined near the top of this file. - hipLaunchKernelGGL((sharedMatMultKernel), dim3(griddim), dim3(blockdim), 0, 0, N, d_C, d_A, d_B); + hipLaunchKernelGGL((sharedMatMultKernel), + dim3(griddim), + dim3(blockdim), + 0, + 0, + N, + d_C, + d_A, + d_B); hipDeviceSynchronize(); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); memoryManager::deallocate_gpu(d_A); memoryManager::deallocate_gpu(d_B); memoryManager::deallocate_gpu(d_C); #endif // if RAJA_ENABLE_HIP -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(A); memoryManager::deallocate(B); memoryManager::deallocate(C); @@ -776,16 +811,22 @@ template void checkResult(T* C, int N) { bool match = true; - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - if ( std::abs( C(row, col) - row * col * N ) > 10e-12 ) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + if (std::abs(C(row, col) - row * col * N) > 10e-12) + { match = false; } } } - if ( match ) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -794,16 +835,22 @@ template void checkResult(RAJA::View> Cview, int N) { bool match = true; - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - if ( std::abs( Cview(row, col) - row * col * N ) > 10e-12 ) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + if (std::abs(Cview(row, col) - row * col * N) > 10e-12) + { match = false; } } } - if ( match ) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -815,10 +862,12 @@ template void printResult(T* C, int N) { std::cout << std::endl; - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - std::cout << "C(" << row << "," << col << ") = " - << C(row, col) << std::endl; + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + std::cout << "C(" << row << "," << col << ") = " << C(row, col) + << std::endl; } } std::cout << std::endl; @@ -828,10 +877,12 @@ template void printResult(RAJA::View> Cview, int N) { std::cout << std::endl; - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - std::cout << "C(" << row << "," << col << ") = " - << Cview(row, col) << std::endl; + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + std::cout << "C(" << row << "," << col << ") = " << Cview(row, col) + << std::endl; } } std::cout << std::endl; diff --git a/examples/launch_reductions.cpp b/examples/launch_reductions.cpp index 24e313e649..c5de9b0e30 100644 --- a/examples/launch_reductions.cpp +++ b/examples/launch_reductions.cpp @@ -45,15 +45,17 @@ using device_loop = RAJA::hip_global_thread_x; using launch_policy = RAJA::LaunchPolicy; + >; using loop_pol = RAJA::LoopPolicy; + >; #if defined(RAJA_ENABLE_CUDA) using reduce_policy = RAJA::cuda_reduce; @@ -66,11 +68,13 @@ using reduce_policy = RAJA::seq_reduce; #endif -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { - if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./launch_reductions host or ./launch_reductions device"); + if (argc != 2) + { + RAJA_ABORT_OR_THROW("Usage ./launch_reductions host or ./launch_reductions " + "device"); } // @@ -79,39 +83,51 @@ int main(int argc, char *argv[]) // Example usage ./launch_reductions host or ./launch_reductions device // std::string exec_space = argv[1]; - if(!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0 )){ - RAJA_ABORT_OR_THROW("Usage ./launch_reductions host or ./launch_reductions device"); + if (!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0)) + { + RAJA_ABORT_OR_THROW("Usage ./launch_reductions host or ./launch_reductions " + "device"); return 0; } RAJA::ExecPlace select_cpu_or_gpu; - if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA-Launch reductions example on the host \n"); } - if(exec_space.compare("device") == 
0) - { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA-Launch reductions example on the device \n"); } + if (exec_space.compare("host") == 0) + { + select_cpu_or_gpu = RAJA::ExecPlace::HOST; + printf("Running RAJA-Launch reductions example on the host \n"); + } + if (exec_space.compare("device") == 0) + { + select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; + printf("Running RAJA-Launch reductions example on the device \n"); + } // _reductions_array_init_start -// -// Define array length -// + // + // Define array length + // const int N = 1000000; -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. + // int* a = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { - if ( i % 2 == 0 ) { + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { a[i] = 1; - } else { + } + else + { a[i] = -1; } } -// -// Set min and max loc values -// + // + // Set min and max loc values + // const int minloc_ref = N / 2; a[minloc_ref] = -100; @@ -119,70 +135,69 @@ int main(int argc, char *argv[]) a[maxloc_ref] = 100; // _reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// - the sum will be zero -// - the min will be -100 -// - the max will be 100 -// - the min loc will be N/2 -// - the max loc will be N/2 + 1 -// -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // - the sum will be zero + // - the min will be -100 + // - the max will be 100 + // - the min loc will be N/2 + // - the max loc will be N/2 + 1 + // + // -// -// Define index range for iterating over a elements in all examples -// + // + // Define index range for iterating over a elements in all examples + // // _reductions_range_start RAJA::RangeSegment arange(0, N); // _reductions_range_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// RAJA::ReduceSum kernel_sum(0); - RAJA::ReduceMin kernel_min(std::numeric_limits::max()); - RAJA::ReduceMax kernel_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc kernel_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc kernel_maxloc(std::numeric_limits::min(), -1); + RAJA::ReduceMin kernel_min( + std::numeric_limits::max()); + RAJA::ReduceMax kernel_max( + std::numeric_limits::min()); + RAJA::ReduceMinLoc kernel_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc kernel_maxloc( + std::numeric_limits::min(), -1); const int TEAM_SZ = 256; - const int GRID_SZ = RAJA_DIVIDE_CEILING_INT(N,TEAM_SZ); - - RAJA::launch - (select_cpu_or_gpu, - RAJA::LaunchParams(RAJA::Teams(GRID_SZ), - RAJA::Threads(TEAM_SZ)), - "Launch Reductions", - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) - { - - RAJA::loop(ctx, arange, [&] (int i) { - - kernel_sum += a[i]; - - kernel_min.min(a[i]); - kernel_max.max(a[i]); - - kernel_minloc.minloc(a[i], i); - kernel_maxloc.maxloc(a[i], i); - }); - - }); + const int GRID_SZ = RAJA_DIVIDE_CEILING_INT(N, TEAM_SZ); + + RAJA::launch( + select_cpu_or_gpu, + RAJA::LaunchParams(RAJA::Teams(GRID_SZ), RAJA::Threads(TEAM_SZ)), + "Launch Reductions", + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, arange, [&](int i) { + kernel_sum += a[i]; + + kernel_min.min(a[i]); + kernel_max.max(a[i]); + + 
kernel_minloc.minloc(a[i], i); + kernel_maxloc.maxloc(a[i], i); + }); + }); std::cout << "\tsum = " << kernel_sum.get() << std::endl; std::cout << "\tmin = " << kernel_min.get() << std::endl; std::cout << "\tmax = " << kernel_max.get() << std::endl; std::cout << "\tmin, loc = " << kernel_minloc.get() << " , " - << kernel_minloc.getLoc() << std::endl; + << kernel_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << kernel_maxloc.get() << " , " - << kernel_maxloc.getLoc() << std::endl; + << kernel_maxloc.getLoc() << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(a); std::cout << "\n DONE!...\n"; diff --git a/examples/memoryManager.hpp b/examples/memoryManager.hpp index 62d3d6e3e7..6f68615a45 100644 --- a/examples/memoryManager.hpp +++ b/examples/memoryManager.hpp @@ -28,20 +28,20 @@ namespace memoryManager { #if defined(RAJA_ENABLE_SYCL) - static camp::resources::Resource* sycl_res; +static camp::resources::Resource* sycl_res; #endif template -T *allocate(RAJA::Index_type size) +T* allocate(RAJA::Index_type size) { - T *ptr; + T* ptr; #if defined(RAJA_ENABLE_CUDA) cudaErrchk( - cudaMallocManaged((void **)&ptr, sizeof(T) * size, cudaMemAttachGlobal)); + cudaMallocManaged((void**)&ptr, sizeof(T) * size, cudaMemAttachGlobal)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipMalloc((void **)&ptr, sizeof(T) * size)); + hipErrchk(hipMalloc((void**)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_SYCL) - ptr = sycl_res->allocate(size, camp::resources::MemoryAccess::Managed); + ptr = sycl_res->allocate(size, camp::resources::MemoryAccess::Managed); #else ptr = new T[size]; #endif @@ -49,9 +49,10 @@ T *allocate(RAJA::Index_type size) } template -void deallocate(T *&ptr) +void deallocate(T*& ptr) { - if (ptr) { + if (ptr) + { #if defined(RAJA_ENABLE_CUDA) cudaErrchk(cudaFree(ptr)); #elif defined(RAJA_ENABLE_HIP) @@ -65,37 +66,39 @@ void deallocate(T *&ptr) } } -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) - template - T *allocate_gpu(RAJA::Index_type size) - { - T *ptr; +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || \ + defined(RAJA_ENABLE_SYCL) +template +T* allocate_gpu(RAJA::Index_type size) +{ + T* ptr; #if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaMalloc((void **)&ptr, sizeof(T) * size)); + cudaErrchk(cudaMalloc((void**)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipMalloc((void **)&ptr, sizeof(T) * size)); + hipErrchk(hipMalloc((void**)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_SYCL) - auto qu = sycl_res->get().get_queue(); - ptr = cl::sycl::malloc_device(size, *qu); + auto qu = sycl_res->get().get_queue(); + ptr = cl::sycl::malloc_device(size, *qu); #endif - return ptr; - } + return ptr; +} - template - void deallocate_gpu(T *&ptr) +template +void deallocate_gpu(T*& ptr) +{ + if (ptr) { - if (ptr) { #if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaFree(ptr)); + cudaErrchk(cudaFree(ptr)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipFree(ptr)); + hipErrchk(hipFree(ptr)); #elif defined(RAJA_ENABLE_SYCL) sycl_res->deallocate(ptr); #endif - ptr = nullptr; - } + ptr = nullptr; } +} #endif -}; // namespace memoryManager +}; // namespace memoryManager #endif diff --git a/examples/multiview.cpp b/examples/multiview.cpp index b765dc84d4..378abde700 100644 --- a/examples/multiview.cpp +++ 
b/examples/multiview.cpp @@ -15,12 +15,12 @@ * A RAJA::MultiView object wraps an array-of-pointers, * or a pointer-to-pointers, whereas a RAJA::View wraps a single * pointer or array. This allows a single RAJA::Layout to be applied to - * multiple arrays internal to the MultiView, allowing multiple arrays to share indexing - * arithmetic when their access patterns are the same. - * + * multiple arrays internal to the MultiView, allowing multiple arrays to share + * indexing arithmetic when their access patterns are the same. + * * The instantiation of a MultiView works exactly like a standard View, - * except that it takes an array-of-pointers. In the following example, a MultiView - * applies a 1-D layout of length 4 to 2 internal arrays in myarr: + * except that it takes an array-of-pointers. In the following example, a + * MultiView applies a 1-D layout of length 4 to 2 internal arrays in myarr: * * // Arrays of the same size, which will become internal to the MultiView. * int a1[4] = {5,6,7,8}; @@ -31,30 +31,35 @@ * myarr[0] = a1; * myarr[1] = a2; * - * // This MultiView applies a 1-D layout of length 4 to each internal array in myarr. - * RAJA::MultiView< int, RAJA::Layout<1> > MView(myarr, 4); - * - * The default MultiView accesses internal arrays via the 0th index of the MultiView: - * - * MView( 0, 4 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 - * MView( 1, 2 ); // accesses 2nd index of the 1st internal array a2, returns value of 10 - * + * // This MultiView applies a 1-D layout of length 4 to each internal array + * in myarr. RAJA::MultiView< int, RAJA::Layout<1> > MView(myarr, 4); + * + * The default MultiView accesses internal arrays via the 0th index of the + * MultiView: + * + * MView( 0, 4 ); // accesses the 4th index of the 0th internal array a1, + * returns value of 8 MView( 1, 2 ); // accesses 2nd index of the 1st internal + * array a2, returns value of 10 + * * The index into the array-of-pointers can be moved to different - * indices of the MultiView () access operator, rather than the default 0th index. By - * passing a third template parameter to the MultiView constructor, the internal array index - * and the integer indicating which array to access can be reversed: + * indices of the MultiView () access operator, rather than the default 0th + * index. By passing a third template parameter to the MultiView constructor, + * the internal array index and the integer indicating which array to access can + * be reversed: * * // MultiView with array-of-pointers index in 1st position * RAJA::MultiView< int, RAJA::Layout<1>, 1 > MView1(myarr, 4); * - * MView1( 4, 0 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 - * MView1( 2, 1 ); // accesses 2nd index of the 1st internal array a2, returns value of 10 - * - * As the number of Layout dimensions increases, the index into the array-of-pointers can be - * moved to more distinct locations in the MultiView () access operator. Here is an example - * which compares the accesses of a 2-D layout on a normal RAJA::View with a RAJA::MultiView - * with the array-of-pointers index set to the 2nd position: - * + * MView1( 4, 0 ); // accesses the 4th index of the 0th internal array a1, + * returns value of 8 MView1( 2, 1 ); // accesses 2nd index of the 1st internal + * array a2, returns value of 10 + * + * As the number of Layout dimensions increases, the index into the + * array-of-pointers can be moved to more distinct locations in the MultiView () + * access operator. 
Here is an example which compares the accesses of a 2-D + * layout on a normal RAJA::View with a RAJA::MultiView with the + * array-of-pointers index set to the 2nd position: + * * RAJA::View< int, RAJA::Layout<2> > normalView(a1, 2, 2); * * normalView( 2, 1 ); // accesses 3rd index of the a1 array, value = 7 @@ -62,8 +67,9 @@ * // MultiView with array-of-pointers index in 2nd position * RAJA::MultiView< int, RAJA::Layout<2>, 2 > MView2(myarr, 2, 2); * - * MView2( 2, 1, 0 ); // accesses the 3rd index of the 0th internal array a1, returns value of 7 (same as normaView(2,1)) - * MView2( 2, 1, 1 ); // accesses the 3rd index of the 1st internal array a2, returns value of 11 + * MView2( 2, 1, 0 ); // accesses the 3rd index of the 0th internal array a1, + * returns value of 7 (same as normaView(2,1)) MView2( 2, 1, 1 ); // accesses + * the 3rd index of the 1st internal array a2, returns value of 11 * * The following code demonstrates 2 aspects of RAJA::MultiView usage: * - Basic usage @@ -75,53 +81,62 @@ void docs_example() // temporaries int t1, t2, t3, t4; - printf( "MultiView Example from RAJA Documentation:\n" ); + printf("MultiView Example from RAJA Documentation:\n"); // _multiview_example_1Dinit_start // Arrays of the same size, which will become internal to the MultiView. - int a1[4] = {5,6,7,8}; - int a2[4] = {9,10,11,12}; + int a1[4] = {5, 6, 7, 8}; + int a2[4] = {9, 10, 11, 12}; // Array-of-pointers which will be passed into MultiView. - int * myarr[2]; + int* myarr[2]; myarr[0] = a1; myarr[1] = a2; - // This MultiView applies a 1-D layout of length 4 to each internal array in myarr. - RAJA::MultiView< int, RAJA::Layout<1> > MView(myarr, 4); + // This MultiView applies a 1-D layout of length 4 to each internal array in + // myarr. + RAJA::MultiView> MView(myarr, 4); // _multiview_example_1Dinit_end // _multiview_example_1Daccess_start - t1 = MView( 0, 3 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 - t2 = MView( 1, 2 ); // accesses 3rd index of the 1st internal array a2, returns value of 11 + t1 = MView(0, 3); // accesses the 4th index of the 0th internal array a1, + // returns value of 8 + t2 = MView(1, 2); // accesses 3rd index of the 1st internal array a2, returns + // value of 11 // _multiview_example_1Daccess_end // _multiview_example_1Daopindex_start // MultiView with array-of-pointers index in 1st position. 
- RAJA::MultiView< int, RAJA::Layout<1>, 1 > MView1(myarr, 4); + RAJA::MultiView, 1> MView1(myarr, 4); - t3 = MView1( 3, 0 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 - t4 = MView1( 2, 1 ); // accesses 3rd index of the 1st internal array a2, returns value of 11 + t3 = MView1(3, 0); // accesses the 4th index of the 0th internal array a1, + // returns value of 8 + t4 = MView1(2, 1); // accesses 3rd index of the 1st internal array a2, returns + // value of 11 // _multiview_example_1Daopindex_end - printf( "Comparison of default MultiView with another MultiView that has the array-of-pointers index in the 1st position of the () accessor:\n" ); - printf( "MView( 0, 3 ) = %i, MView1( 3, 0 ) = %i\n", t1, t3 ); - printf( "MView( 1, 2 ) = %i, MView1( 2, 1 ) = %i\n", t2, t4 ); + printf("Comparison of default MultiView with another MultiView that has the " + "array-of-pointers index in the 1st position of the () accessor:\n"); + printf("MView( 0, 3 ) = %i, MView1( 3, 0 ) = %i\n", t1, t3); + printf("MView( 1, 2 ) = %i, MView1( 2, 1 ) = %i\n", t2, t4); // _multiview_example_2Daopindex_start - RAJA::View< int, RAJA::Layout<2> > normalView(a1, 2, 2); + RAJA::View> normalView(a1, 2, 2); - t1 = normalView( 1, 1 ); // accesses 4th index of the a1 array, value = 8 + t1 = normalView(1, 1); // accesses 4th index of the a1 array, value = 8 // MultiView with array-of-pointers index in 2nd position - RAJA::MultiView< int, RAJA::Layout<2>, 2 > MView2(myarr, 2, 2); + RAJA::MultiView, 2> MView2(myarr, 2, 2); - t2 = MView2( 1, 1, 0 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 (same as normalView(1,1)) - t3 = MView2( 0, 0, 1 ); // accesses the 1st index of the 1st internal array a2, returns value of 9 + t2 = MView2(1, 1, 0); // accesses the 4th index of the 0th internal array a1, + // returns value of 8 (same as normalView(1,1)) + t3 = MView2(0, 0, 1); // accesses the 1st index of the 1st internal array a2, + // returns value of 9 // _multiview_example_2Daopindex_end - printf( "Comparison of 2D normal View with 2D MultiView that has the array-of-pointers index in the 2nd position of the () accessor:\n" ); - printf( "normalView( 1, 1 ) = %i, MView2( 1, 1, 0 ) = %i\n", t1, t2 ); + printf("Comparison of 2D normal View with 2D MultiView that has the " + "array-of-pointers index in the 2nd position of the () accessor:\n"); + printf("normalView( 1, 1 ) = %i, MView2( 1, 1, 0 ) = %i\n", t1, t2); } int main() @@ -129,11 +144,11 @@ int main() docs_example(); constexpr int N = 12; - int * myarr[2]; // two 3x4 arrays + int* myarr[2]; // two 3x4 arrays int arr1[N]; int arr2[N]; - for ( int ii = 0; ii < N; ++ii ) + for (int ii = 0; ii < N; ++ii) { arr1[ii] = 100 + ii; arr2[ii] = 200 + ii; @@ -143,39 +158,47 @@ int main() myarr[1] = arr2; // 4x3 layout - std::array perm { {0, 1} }; - RAJA::Layout<2> layout = RAJA::make_permuted_layout( - { {4, 3} }, perm - ); + std::array perm{{0, 1}}; + RAJA::Layout<2> layout = RAJA::make_permuted_layout({{4, 3}}, perm); // Basic MultiView usage // Default usage: no specified array-of-pointers index moving // 0th position is used as the array-of-pointers index - RAJA::MultiView> arrView(myarr, layout); + RAJA::MultiView> arrView(myarr, + layout); // Moved array-of-pointers index MultiView usage // Add an array-of-pointers index specifier constexpr int aopidx = 1; - RAJA::MultiView, aopidx> arrViewMov(myarr, layout); + RAJA::MultiView, aopidx> arrViewMov( + myarr, layout); // Comparing values of both views - printf ( "Comparing values 
of both default and 1-index-ed MultiViews:\n" ); - for ( int pp = 0; pp < 2; ++pp ) + printf("Comparing values of both default and 1-index-ed MultiViews:\n"); + for (int pp = 0; pp < 2; ++pp) { - for ( int kk = 0; kk < 4; ++kk ) + for (int kk = 0; kk < 4; ++kk) { - for ( int jj = 0; jj < 3; ++jj ) + for (int jj = 0; jj < 3; ++jj) { - printf ( "arr(%i, %i, %i) %d == arrmov(%i, %i, %i) %d\n", pp, kk, jj, arrView(pp, kk, jj), kk, pp, jj, arrViewMov(kk, pp, jj) ); + printf("arr(%i, %i, %i) %d == arrmov(%i, %i, %i) %d\n", + pp, + kk, + jj, + arrView(pp, kk, jj), + kk, + pp, + jj, + arrViewMov(kk, pp, jj)); } } } // switch values - printf ( "Switching values\n" ); - for ( int kk = 0; kk < 4; ++kk ) + printf("Switching values\n"); + for (int kk = 0; kk < 4; ++kk) { - for ( int jj = 0; jj < 3; ++jj ) + for (int jj = 0; jj < 3; ++jj) { int temp = arrView(0, kk, jj); arrView(0, kk, jj) = arrView(1, kk, jj); @@ -184,14 +207,23 @@ int main() } // Comparing switched values of both views - printf ( "Comparing switched values of both default and 1-index-ed MultiViews:\n" ); - for ( int pp = 0; pp < 2; ++pp ) + printf("Comparing switched values of both default and 1-index-ed " + "MultiViews:\n"); + for (int pp = 0; pp < 2; ++pp) { - for ( int kk = 0; kk < 4; ++kk ) + for (int kk = 0; kk < 4; ++kk) { - for ( int jj = 0; jj < 3; ++jj ) + for (int jj = 0; jj < 3; ++jj) { - printf ( "arr(%i, %i, %i) %d == arrmov(%i, %i, %i) %d\n", pp, kk, jj, arrView(pp, kk, jj), kk, pp, jj, arrViewMov(kk, pp, jj) ); + printf("arr(%i, %i, %i) %d == arrmov(%i, %i, %i) %d\n", + pp, + kk, + jj, + arrView(pp, kk, jj), + kk, + pp, + jj, + arrViewMov(kk, pp, jj)); } } } diff --git a/examples/omp-target-kernel.cpp b/examples/omp-target-kernel.cpp index ce425e07a6..1d101f1ca5 100644 --- a/examples/omp-target-kernel.cpp +++ b/examples/omp-target-kernel.cpp @@ -10,35 +10,33 @@ using namespace RAJA; using namespace RAJA::statement; -int main(int /*argc*/, char** /*argv[]*/) { +int main(int /*argc*/, char** /*argv[]*/) +{ // using Pol = KernelPolicy< // For<1, RAJA::seq_exec>, // For<0, RAJA::omp_target_parallel_for_exec<1>, Lambda<0> > // >; using Pol = KernelPolicy< - Collapse, Lambda<0> > >; + Collapse, Lambda<0>>>; - double* array = new double[25*25]; + double* array = new double[25 * 25]; -#pragma omp target enter data map(to: array[0:25*25]) +#pragma omp target enter data map(to : array [0:25 * 25]) #pragma omp target data use_device_ptr(array) #if 1 RAJA::kernel( - RAJA::make_tuple( - RAJA::RangeSegment(0,25), - RAJA::RangeSegment(0,25)), - [=] (int /*i*/, int /*j*/) { - //array[i + (25*j)] = i*j; - // int idx = i; - //array[0] = i*j; - }); + RAJA::make_tuple(RAJA::RangeSegment(0, 25), RAJA::RangeSegment(0, 25)), + [=](int /*i*/, int /*j*/) { + // array[i + (25*j)] = i*j; + // int idx = i; + // array[0] = i*j; + }); #else - RAJA::forall>( - RAJA::RangeSegment(0,25), - [=] (int i) { - // - }); + RAJA::forall>(RAJA::RangeSegment(0, 25), + [=](int i) { + // + }); #endif } diff --git a/examples/omp-target-ltimes.cpp b/examples/omp-target-ltimes.cpp index f51694b3af..c04939c26f 100644 --- a/examples/omp-target-ltimes.cpp +++ b/examples/omp-target-ltimes.cpp @@ -9,7 +9,6 @@ #include - #include "RAJA/RAJA.hpp" #include "RAJA/util/Timer.hpp" @@ -28,22 +27,25 @@ RAJA_INDEX_VALUE(IZone, "IZone"); void runLTimesRajaKernel(bool debug, - Index_type num_moments, - Index_type num_directions, - Index_type num_groups, - Index_type num_zones) + Index_type num_moments, + Index_type num_directions, + Index_type num_groups, + Index_type num_zones) { - 
using namespace RAJA::statement; + using namespace RAJA::statement; // psi[direction, group, zone] - using PsiView = RAJA::TypedView, IDirection, IGroup, IZone>; + using PsiView = RAJA:: + TypedView, IDirection, IGroup, IZone>; // phi[moment, group, zone] - using PhiView = RAJA::TypedView, IMoment, IGroup, IZone>; + using PhiView = + RAJA::TypedView, IMoment, IGroup, IZone>; // ell[moment, direction] - using EllView = RAJA::TypedView, IMoment, IDirection>; + using EllView = + RAJA::TypedView, IMoment, IDirection>; // allocate data @@ -54,16 +56,19 @@ void runLTimesRajaKernel(bool debug, // randomize data - for (size_t i = 0; i < ell_data.size(); ++i) { - ell_data[i] = i; //drand48(); + for (size_t i = 0; i < ell_data.size(); ++i) + { + ell_data[i] = i; // drand48(); } - for (size_t i = 0; i < psi_data.size(); ++i) { - psi_data[i] = 2*i; //drand48(); + for (size_t i = 0; i < psi_data.size(); ++i) + { + psi_data[i] = 2 * i; // drand48(); } - for (size_t i = 0; i < phi_data.size(); ++i) { - phi_data[i] = 0; //drand48(); + for (size_t i = 0; i < phi_data.size(); ++i) + { + phi_data[i] = 0; // drand48(); } int hid = omp_get_initial_device(); @@ -71,58 +76,50 @@ void runLTimesRajaKernel(bool debug, // create device memory double *d_ell, *d_phi, *d_psi; - d_ell = static_cast(omp_target_alloc(sizeof(double) * ell_data.size(), did)); - d_phi = static_cast(omp_target_alloc(sizeof(double) * phi_data.size(), did)); - d_psi = static_cast(omp_target_alloc(sizeof(double) * psi_data.size(), did)); + d_ell = static_cast( + omp_target_alloc(sizeof(double) * ell_data.size(), did)); + d_phi = static_cast( + omp_target_alloc(sizeof(double) * phi_data.size(), did)); + d_psi = static_cast( + omp_target_alloc(sizeof(double) * psi_data.size(), did)); // Copy to device omp_target_memcpy( - &ell_data[0], - d_ell, - sizeof(double) * ell_data.size(), - 0,0, hid, did); + &ell_data[0], d_ell, sizeof(double) * ell_data.size(), 0, 0, hid, did); omp_target_memcpy( - &phi_data[0], - d_phi, - sizeof(double) * phi_data.size(), - 0,0,hid,did); + &phi_data[0], d_phi, sizeof(double) * phi_data.size(), 0, 0, hid, did); omp_target_memcpy( - &psi_data[0], - d_psi, - sizeof(double) * psi_data.size(), - 0,0,hid,did); + &psi_data[0], d_psi, sizeof(double) * psi_data.size(), 0, 0, hid, did); // create views on data - std::array ell_perm {{0, 1}}; - EllView ell( - d_ell, - make_permuted_layout({{num_moments, num_directions}}, ell_perm)); + std::array ell_perm{{0, 1}}; + EllView ell(d_ell, + make_permuted_layout({{num_moments, num_directions}}, ell_perm)); - std::array psi_perm {{0, 1, 2}}; - PsiView psi( - d_psi, - make_permuted_layout({{num_directions, num_groups, num_zones}}, psi_perm)); + std::array psi_perm{{0, 1, 2}}; + PsiView psi(d_psi, + make_permuted_layout({{num_directions, num_groups, num_zones}}, + psi_perm)); - std::array phi_perm {{0, 1, 2}}; + std::array phi_perm{{0, 1, 2}}; PhiView phi( d_phi, make_permuted_layout({{num_moments, num_groups, num_zones}}, phi_perm)); - - using Pol = RAJA::KernelPolicy< - Collapse, - For<3, RAJA::seq_exec, Lambda<0>>>>; + using Pol = RAJA::KernelPolicy, + For<3, RAJA::seq_exec, Lambda<0>>>>; RAJA::Timer timer; timer.start(); - auto segments = RAJA::make_tuple(TypedRangeSegment(0, num_moments), - TypedRangeSegment(0, num_directions), - TypedRangeSegment(0, num_groups), - TypedRangeSegment(0, num_zones)); + auto segments = + RAJA::make_tuple(TypedRangeSegment(0, num_moments), + TypedRangeSegment(0, num_directions), + TypedRangeSegment(0, num_groups), + TypedRangeSegment(0, num_zones)); 
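  // Conceptually, the kernel<Pol> call below visits every (m, d, g, z)
  // combination drawn from the four segments above and applies the lambda;
  // an equivalent plain-loop sketch (illustrative only, ignoring the OpenMP
  // target collapse that Pol applies to the outer loops):
  //
  //   for (IMoment m(0); m < num_moments; ++m)
  //     for (IDirection d(0); d < num_directions; ++d)
  //       for (IGroup g(0); g < num_groups; ++g)
  //         for (IZone z(0); z < num_zones; ++z)
  //           phi(m, g, z) += ell(m, d) * psi(d, g, z);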
kernel( @@ -130,56 +127,62 @@ void runLTimesRajaKernel(bool debug, segments, // Lambda_CalcPhi - [=] (IMoment m, IDirection d, IGroup g, IZone z) { + [=](IMoment m, IDirection d, IGroup g, IZone z) { phi(m, g, z) += ell(m, d) * psi(d, g, z); }); - timer.stop(); - printf("LTimes took %lf seconds using RAJA::kernel\n", - timer.elapsed()); + printf("LTimes took %lf seconds using RAJA::kernel\n", timer.elapsed()); // Check correctness - if(debug){ + if (debug) + { size_t errors = 0; double total_error = 0.; - for (IZone z(0); z < num_zones; ++z) { - for (IGroup g(0); g < num_groups; ++g) { - for (IMoment m(0); m < num_moments; ++m) { + for (IZone z(0); z < num_zones; ++z) + { + for (IGroup g(0); g < num_groups; ++g) + { + for (IMoment m(0); m < num_moments; ++m) + { double total = 0.0; - for (IDirection d(0); d < num_directions; ++d) { + for (IDirection d(0); d < num_directions; ++d) + { double val = ell(m, d) * psi(d, g, z); total += val; } - if(std::abs(total-phi(m,g,z)) > 1e-9){ - ++ errors; + if (std::abs(total - phi(m, g, z)) > 1e-9) + { + ++errors; } - total_error += std::abs(total-phi(m,g,z)); + total_error += std::abs(total - phi(m, g, z)); } } } - if(errors == 0){ + if (errors == 0) + { printf(" -- no errors (%e)\n", total_error); } - else{ + else + { printf(" -- failed : %ld errors\n", (long)errors); } } - } -int main(){ +int main() +{ bool debug = true; int m = 25; int d = 80; int g = 32; - int z = 32*1024; + int z = 32 * 1024; printf("m=%d, d=%d, g=%d, z=%d\n", m, d, g, z); @@ -187,5 +190,3 @@ int main(){ return 0; } - - diff --git a/examples/pi-reduce_vs_atomic.cpp b/examples/pi-reduce_vs_atomic.cpp index ea0c18611f..fae7c47c19 100644 --- a/examples/pi-reduce_vs_atomic.cpp +++ b/examples/pi-reduce_vs_atomic.cpp @@ -45,55 +45,54 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA pi example...\n"; -// -// Define RangeSegment to enumerate "bins" and "bin step" size used in -// Riemann integral sum to approximate pi, -// and memory location for atomic add operation. -// + // + // Define RangeSegment to enumerate "bins" and "bin step" size used in + // Riemann integral sum to approximate pi, + // and memory location for atomic add operation. 
+ // const int num_bins = 512 * 512; - const double dx = 1.0 / double(num_bins); + const double dx = 1.0 / double(num_bins); - RAJA::RangeSegment bins(0, num_bins); + RAJA::RangeSegment bins(0, num_bins); double* atomic_pi = memoryManager::allocate(1); -// Set precision for printing pi + // Set precision for printing pi int prec = 16; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential pi approximation...\n"; - + double c_pi = 0.0; - for (int i = 0; i < num_bins; ++i) { - double x = (double(i) + 0.5) * dx; - c_pi += dx / (1.0 + x * x); + for (int i = 0; i < num_bins; ++i) + { + double x = (double(i) + 0.5) * dx; + c_pi += dx / (1.0 + x * x); } c_pi *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << c_pi << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << c_pi << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential pi approximation (reduction)...\n"; - using EXEC_POL1 = RAJA::seq_exec; - using REDUCE_POL1 = RAJA::seq_reduce; + using EXEC_POL1 = RAJA::seq_exec; + using REDUCE_POL1 = RAJA::seq_reduce; RAJA::ReduceSum seq_pi(0.0); RAJA::forall(bins, [=](int i) { - double x = (double(i) + 0.5) * dx; - seq_pi += dx / (1.0 + x * x); + double x = (double(i) + 0.5) * dx; + seq_pi += dx / (1.0 + x * x); }); double seq_pi_val = seq_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << seq_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << seq_pi_val << std::endl; std::cout << "\n Running RAJA sequential pi approximation (atomic)...\n"; @@ -103,35 +102,32 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) *atomic_pi = 0.0; RAJA::forall(bins, [=](int i) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(atomic_pi, - dx / (1.0 + x * x)); + double x = (double(i) + 0.5) * dx; + RAJA::atomicAdd(atomic_pi, dx / (1.0 + x * x)); }); *atomic_pi *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << *atomic_pi << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << *atomic_pi << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP pi approximation (reduction)...\n"; - using EXEC_POL2 = RAJA::omp_parallel_for_exec; + using EXEC_POL2 = RAJA::omp_parallel_for_exec; using REDUCE_POL2 = RAJA::omp_reduce; RAJA::ReduceSum omp_pi(0.0); RAJA::forall(bins, [=](int i) { - double x = (double(i) + 0.5) * dx; - omp_pi += dx / (1.0 + x * x); + double x = (double(i) + 0.5) * dx; + omp_pi += dx / (1.0 + x * x); }); double omp_pi_val = omp_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << omp_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << omp_pi_val << std::endl; std::cout << "\n Running RAJA OpenMP pi approximation (atomic)...\n"; @@ -141,37 +137,34 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) *atomic_pi = 0.0; RAJA::forall(bins, [=](int i) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(atomic_pi, - dx / (1.0 + x * x)); + double x = (double(i) + 0.5) * dx; + RAJA::atomicAdd(atomic_pi, dx / (1.0 + x * x)); }); 
*atomic_pi *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << *atomic_pi << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << *atomic_pi << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA pi approximation (reduction)...\n"; - using EXEC_POL3 = RAJA::cuda_exec; + using EXEC_POL3 = RAJA::cuda_exec; using REDUCE_POL3 = RAJA::cuda_reduce; RAJA::ReduceSum cuda_pi(0.0); - RAJA::forall(bins, [=] RAJA_DEVICE (int i) { - double x = (double(i) + 0.5) * dx; - cuda_pi += dx / (1.0 + x * x); + RAJA::forall(bins, [=] RAJA_DEVICE(int i) { + double x = (double(i) + 0.5) * dx; + cuda_pi += dx / (1.0 + x * x); }); double cuda_pi_val = cuda_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << cuda_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << cuda_pi_val << std::endl; std::cout << "\n Running RAJA CUDA pi approximation (atomic)...\n"; @@ -180,63 +173,62 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) *atomic_pi = 0.0; - RAJA::forall(bins, [=] RAJA_DEVICE (int i) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(atomic_pi, dx / (1.0 + x * x)); + RAJA::forall(bins, [=] RAJA_DEVICE(int i) { + double x = (double(i) + 0.5) * dx; + RAJA::atomicAdd(atomic_pi, dx / (1.0 + x * x)); }); *atomic_pi *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << *atomic_pi << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << *atomic_pi << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP pi approximation (reduction)...\n"; - using EXEC_POL4 = RAJA::hip_exec; + using EXEC_POL4 = RAJA::hip_exec; using REDUCE_POL4 = RAJA::hip_reduce; RAJA::ReduceSum hip_pi(0.0); - RAJA::forall(bins, [=] RAJA_DEVICE (int i) { - double x = (double(i) + 0.5) * dx; - hip_pi += dx / (1.0 + x * x); + RAJA::forall(bins, [=] RAJA_DEVICE(int i) { + double x = (double(i) + 0.5) * dx; + hip_pi += dx / (1.0 + x * x); }); double hip_pi_val = hip_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << hip_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << hip_pi_val << std::endl; std::cout << "\n Running RAJA HIP pi approximation (atomic)...\n"; *atomic_pi = 0; double* d_atomic_pi = memoryManager::allocate_gpu(1); - hipErrchk(hipMemcpy( d_atomic_pi, atomic_pi, 1 * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( + d_atomic_pi, atomic_pi, 1 * sizeof(double), hipMemcpyHostToDevice)); using ATOMIC_POL4 = RAJA::hip_atomic; - RAJA::forall(bins, [=] RAJA_DEVICE (int i) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(d_atomic_pi, dx / (1.0 + x * x)); + RAJA::forall(bins, [=] RAJA_DEVICE(int i) { + double x = (double(i) + 0.5) * dx; + RAJA::atomicAdd(d_atomic_pi, dx / (1.0 + x * x)); }); - hipErrchk(hipMemcpy( atomic_pi, d_atomic_pi, 1 * sizeof(double), hipMemcpyDeviceToHost )); - *atomic_pi *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << *atomic_pi << std::endl; + hipErrchk(hipMemcpy( + atomic_pi, d_atomic_pi, 1 * sizeof(double), hipMemcpyDeviceToHost)); + *atomic_pi *= 4.0; + std::cout << "\tpi = " << std::setprecision(prec) << *atomic_pi << std::endl; 
memoryManager::deallocate_gpu(d_atomic_pi); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(atomic_pi); std::cout << "\n DONE!...\n"; diff --git a/examples/plugin/counter-plugin.cpp b/examples/plugin/counter-plugin.cpp index 8134cd9b83..ece7814a71 100644 --- a/examples/plugin/counter-plugin.cpp +++ b/examples/plugin/counter-plugin.cpp @@ -10,45 +10,51 @@ #include -class CounterPlugin : - public RAJA::util::PluginStrategy +class CounterPlugin : public RAJA::util::PluginStrategy { - public: - void preCapture(const RAJA::util::PluginContext& p) override { - if (p.platform == RAJA::Platform::host) +public: + void preCapture(const RAJA::util::PluginContext& p) override + { + if (p.platform == RAJA::Platform::host) { - std::cout << " [CounterPlugin]: Capturing host kernel for the " << ++host_capture_counter << " time!" << std::endl; + std::cout << " [CounterPlugin]: Capturing host kernel for the " + << ++host_capture_counter << " time!" << std::endl; } else { - std::cout << " [CounterPlugin]: Capturing device kernel for the " << ++device_capture_counter << " time!" << std::endl; + std::cout << " [CounterPlugin]: Capturing device kernel for the " + << ++device_capture_counter << " time!" << std::endl; } } - void preLaunch(const RAJA::util::PluginContext& p) override { + void preLaunch(const RAJA::util::PluginContext& p) override + { if (p.platform == RAJA::Platform::host) { - std::cout << " [CounterPlugin]: Launching host kernel for the " << ++host_launch_counter << " time!" << std::endl; + std::cout << " [CounterPlugin]: Launching host kernel for the " + << ++host_launch_counter << " time!" << std::endl; } else { - std::cout << " [CounterPlugin]: Launching device kernel for the " << ++device_launch_counter << " time!" << std::endl; + std::cout << " [CounterPlugin]: Launching device kernel for the " + << ++device_launch_counter << " time!" << std::endl; } } - private: - int host_capture_counter; - int device_capture_counter; - int host_launch_counter; - int device_launch_counter; +private: + int host_capture_counter; + int device_capture_counter; + int host_launch_counter; + int device_launch_counter; }; // Statically loading plugin. -static RAJA::util::PluginRegistry::add P("Counter", "Counts number of kernel launches."); +static RAJA::util::PluginRegistry::add P("Counter", + "Counts " + "number of " + "kernel " + "launches."); // Dynamically loading plugin. 
-extern "C" RAJA::util::PluginStrategy *getPlugin () -{ - return new CounterPlugin; -} +extern "C" RAJA::util::PluginStrategy* getPlugin() { return new CounterPlugin; } // _plugin_example_end diff --git a/examples/plugin/test-plugin-dynamic.cpp b/examples/plugin/test-plugin-dynamic.cpp index c9e574a99e..b73a13441f 100644 --- a/examples/plugin/test-plugin-dynamic.cpp +++ b/examples/plugin/test-plugin-dynamic.cpp @@ -8,15 +8,14 @@ #include "RAJA/RAJA.hpp" #include -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { RAJA::util::init_plugins("../lib/libtimer_plugin.so"); - double *a = new double[10]; + double* a = new double[10]; for (int i = 0; i < 4; i++) { - RAJA::forall(RAJA::RangeSegment(0, 10), [=](int i) { - a[i] = 0; - }); + RAJA::forall(RAJA::RangeSegment(0, 10), + [=](int i) { a[i] = 0; }); } } diff --git a/examples/plugin/test-plugin.cpp b/examples/plugin/test-plugin.cpp index b18233cb90..2164ae7df9 100644 --- a/examples/plugin/test-plugin.cpp +++ b/examples/plugin/test-plugin.cpp @@ -7,13 +7,13 @@ #include "RAJA/RAJA.hpp" -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { double* a = new double[10]; - for (int i = 0; i < 10; i++) { - RAJA::forall(RAJA::RangeSegment(0,10), [=] (int i) { - a[i] = 0; - }); + for (int i = 0; i < 10; i++) + { + RAJA::forall(RAJA::RangeSegment(0, 10), + [=](int i) { a[i] = 0; }); } } diff --git a/examples/plugin/timer-plugin.cpp b/examples/plugin/timer-plugin.cpp index 2619f9fcd9..cdb330a970 100644 --- a/examples/plugin/timer-plugin.cpp +++ b/examples/plugin/timer-plugin.cpp @@ -21,15 +21,19 @@ class TimerPlugin : public RAJA::util::PluginStrategy void postLaunch(const RAJA::util::PluginContext& p) override { end_time = std::chrono::steady_clock::now(); - double elapsedMs = std::chrono::duration(end_time - start_time).count(); + double elapsedMs = + std::chrono::duration(end_time - start_time) + .count(); if (p.platform == RAJA::Platform::host) { - printf("[TimerPlugin]: Elapsed time of host kernel was %f ms\n", elapsedMs); + printf("[TimerPlugin]: Elapsed time of host kernel was %f ms\n", + elapsedMs); } else { - printf("[TimerPlugin]: Elapsed time of device kernel was %f ms\n", elapsedMs); + printf("[TimerPlugin]: Elapsed time of device kernel was %f ms\n", + elapsedMs); } } @@ -39,10 +43,10 @@ class TimerPlugin : public RAJA::util::PluginStrategy }; // Dynamically loading plugin. -extern "C" RAJA::util::PluginStrategy *getPlugin() -{ - return new TimerPlugin; -} +extern "C" RAJA::util::PluginStrategy* getPlugin() { return new TimerPlugin; } // Statically loading plugin. 
-static RAJA::util::PluginRegistry::add P("Timer", "Prints elapsed time of kernel executions."); \ No newline at end of file +static RAJA::util::PluginRegistry::add P("Timer", + "Prints elapsed " + "time of kernel " + "executions."); \ No newline at end of file diff --git a/examples/raja-launch.cpp b/examples/raja-launch.cpp index b2642e16ff..89ed356f37 100644 --- a/examples/raja-launch.cpp +++ b/examples/raja-launch.cpp @@ -56,36 +56,36 @@ using launch_policy = RAJA::LaunchPolicy< */ using teams_x = RAJA::LoopPolicy< #if defined(RAJA_ENABLE_OPENMP) - RAJA::omp_parallel_for_exec + RAJA::omp_parallel_for_exec #else - RAJA::seq_exec + RAJA::seq_exec #endif #if defined(RAJA_ENABLE_CUDA) - , - RAJA::cuda_block_x_direct + , + RAJA::cuda_block_x_direct #endif #if defined(RAJA_ENABLE_HIP) - , - RAJA::hip_block_x_direct + , + RAJA::hip_block_x_direct #endif - >; + >; /* * Define thread policies. * Up to 3 dimension are supported: x,y,z */ using threads_x = RAJA::LoopPolicy; + >; -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { // Resource object for host @@ -109,7 +109,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // RAJA teams may switch between host and device policies at run time. // The loop below will execute through the available backends. - for (int exec_place = 0; exec_place < num_of_backends; ++exec_place) { + for (int exec_place = 0; exec_place < num_of_backends; ++exec_place) + { auto select_cpu_or_gpu = (RAJA::ExecPlace)exec_place; @@ -117,12 +118,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) int N_tri = 5; int* Ddat = nullptr; - if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) { + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) + { Ddat = host_res.allocate(N_tri * N_tri); } #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) - if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) + { Ddat = device_res.allocate(N_tri * N_tri); } #endif @@ -141,51 +144,56 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) * and is used to perform thread synchronizations within a team. 
*/ - if (select_cpu_or_gpu == RAJA::ExecPlace::HOST){ - std::cout << "\n Running upper triangular pattern example on the host...\n"; - } else { - std::cout << "\n Running upper triangular pattern example on the device...\n"; + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) + { + std::cout << "\n Running upper triangular pattern example on the " + "host...\n"; + } + else + { + std::cout << "\n Running upper triangular pattern example on the " + "device...\n"; } RAJA::View> D(Ddat, N_tri, N_tri); - RAJA::launch - (select_cpu_or_gpu, - RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::launch( + select_cpu_or_gpu, + RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { + // Array shared within threads of the same team + RAJA_TEAM_SHARED int s_A[1]; - RAJA::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { - - // Array shared within threads of the same team - RAJA_TEAM_SHARED int s_A[1]; - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](int c) { + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](int c) { s_A[c] = r; - }); // loop c - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { - D(r, c) = r * N_tri + c; - printf("r=%d, c=%d : D=%d : s_A = %d \n", r, c, D(r, c), s_A[0]); - }); // loop c + }); // loop c - }); // loop r + ctx.teamSync(); - }); // outer lambda + RAJA::loop( + ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { + D(r, c) = r * N_tri + c; + printf( + "r=%d, c=%d : D=%d : s_A = %d \n", r, c, D(r, c), s_A[0]); + }); // loop c + }); // loop r + }); // outer lambda - if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) { + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) + { host_res.deallocate(Ddat); } #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) - if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) + { device_res.deallocate(Ddat); } #endif - } // Execution places loop + } // Execution places loop -} // Main +} // Main diff --git a/examples/red-black-gauss-seidel.cpp b/examples/red-black-gauss-seidel.cpp index cfe74dc58a..f22df3cac9 100644 --- a/examples/red-black-gauss-seidel.cpp +++ b/examples/red-black-gauss-seidel.cpp @@ -52,7 +52,8 @@ * h - Spacing between grid points * n - Number of grid points */ -struct grid_s { +struct grid_s +{ double o, h; int n; }; @@ -62,16 +63,16 @@ struct grid_s { * solution - Function for the analytic solution * computeErr - Displays the maximum error in the solution * gsColorPolicy - Generates the custom index set for this example -*/ + */ double solution(double x, double y); -void computeErr(double *I, grid_s grid); -RAJA::TypedIndexSet - gsColorPolicy(int N, camp::resources::Resource res); +void computeErr(double* I, grid_s grid); +RAJA::TypedIndexSet +gsColorPolicy(int N, camp::resources::Resource res); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { - std::cout<<"Red-Black Gauss-Seidel Example"<(NN); + double* I = resource.allocate(NN); memset(I, 0, NN * sizeof(double)); @@ -117,7 +118,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) resI2 = 1; iteration = 0; - while (resI2 > tol * tol) { + while (resI2 > tol * tol) + { #if defined(RAJA_ENABLE_OPENMP) RAJA::ReduceSum RAJA_resI2(0.0); @@ -128,9 +130,7 @@ int main(int RAJA_UNUSED_ARG(argc), char 
**RAJA_UNUSED_ARG(argv[])) // // Gauss-Seidel Iteration // - RAJA::forall(colorSet, - [=](RAJA::Index_type id) { - + RAJA::forall(colorSet, [=](RAJA::Index_type id) { // // Compute x,y grid index // @@ -140,21 +140,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) double x = gridx.o + m * gridx.h; double y = gridx.o + n * gridx.h; - double f = gridx.h * gridx.h * + double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); - double newI = -0.25 * (f - I[id - N - 2] - I[id + N + 2] - - I[id - 1] - I[id + 1]); + double newI = + -0.25 * (f - I[id - N - 2] - I[id + N + 2] - I[id - 1] - I[id + 1]); double oldI = I[id]; RAJA_resI2 += (newI - oldI) * (newI - oldI); I[id] = newI; - }); resI2 = RAJA_resI2; - if (iteration > maxIter) { - std::cout<<"Gauss-Seidel maxed out on iterations"< maxIter) + { + std::cout << "Gauss-Seidel maxed out on iterations" << std::endl; break; } @@ -174,36 +174,40 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // to generate RAJA ListSegments and populate a RAJA Static Index // Set. -RAJA::TypedIndexSet - gsColorPolicy(int N, camp::resources::Resource res) +RAJA::TypedIndexSet +gsColorPolicy(int N, camp::resources::Resource res) { RAJA::TypedIndexSet colorSet; - int redN = static_cast( std::ceil( static_cast(N * N / 2) ) ); - int blkN = static_cast( std::floor( static_cast(N * N / 2) ) ); - RAJA::Index_type *Red = new RAJA::Index_type[redN]; - RAJA::Index_type *Blk = new RAJA::Index_type[blkN]; + int redN = static_cast(std::ceil(static_cast(N * N / 2))); + int blkN = static_cast(std::floor(static_cast(N * N / 2))); + RAJA::Index_type* Red = new RAJA::Index_type[redN]; + RAJA::Index_type* Blk = new RAJA::Index_type[blkN]; int ib = 0; int ir = 0; bool isRed = true; - for (int n = 1; n <= N; ++n) { - - for (int m = 1; m <= N; ++m) { - + for (int n = 1; n <= N; ++n) + { + + for (int m = 1; m <= N; ++m) + { + RAJA::Index_type id = n * (N + 2) + m; - if (isRed) { + if (isRed) + { Red[ib] = id; ib++; - } else { + } + else + { Blk[ir] = id; ir++; } isRed = !isRed; } - } // Create Index @@ -227,26 +231,25 @@ double solution(double x, double y) // // Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf} // -void computeErr(double *I, grid_s grid) +void computeErr(double* I, grid_s grid) { RAJA::RangeSegment fdBounds(0, grid.n); RAJA::ReduceMax tMax(-1.0); - using errPolicy = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>> > >; - - RAJA::kernel(RAJA::make_tuple(fdBounds,fdBounds), - [=] (RAJA::Index_type tx, RAJA::Index_type ty) { - - int id = tx + grid.n * ty; - double x = grid.o + tx * grid.h; - double y = grid.o + ty * grid.h; - double myErr = std::abs(I[id] - solution(x, y)); - tMax.max(myErr); - - }); + using errPolicy = RAJA::KernelPolicy>>>; + + RAJA::kernel(RAJA::make_tuple(fdBounds, fdBounds), + [=](RAJA::Index_type tx, RAJA::Index_type ty) { + int id = tx + grid.n * ty; + double x = grid.o + tx * grid.h; + double y = grid.o + ty * grid.h; + double myErr = std::abs(I[id] - solution(x, y)); + tMax.max(myErr); + }); double l2err = tMax; printf("Max error = %lg, h = %f \n", l2err, grid.h); diff --git a/examples/resource-dynamic-forall.cpp b/examples/resource-dynamic-forall.cpp index 0b35017fac..c4ef22f542 100644 --- a/examples/resource-dynamic-forall.cpp +++ b/examples/resource-dynamic-forall.cpp @@ -28,25 +28,29 @@ void checkResult(int* res, int len); void printResult(int* res, int len); -using 
policy_list = camp::list - ,RAJA::cuda_exec<512> + , + RAJA::cuda_exec<256>, + RAJA::cuda_exec<512> #endif #if defined(RAJA_ENABLE_HIP) - ,RAJA::hip_exec<256> - ,RAJA::hip_exec<512> + , + RAJA::hip_exec<256>, + RAJA::hip_exec<512> #endif >; -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { - if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./cuda-dynamic-forall N, where N is the index of the policy to run"); + if (argc != 2) + { + RAJA_ABORT_OR_THROW("Usage ./cuda-dynamic-forall N, where N is the index " + "of the policy to run"); } // @@ -58,50 +62,55 @@ int main(int argc, char *argv[]) const int pol = std::stoi(argv[1]); RAJA::ExecPlace select_cpu_or_gpu; - if(pol < 2) { + if (pol < 2) + { select_cpu_or_gpu = RAJA::ExecPlace::HOST; - } else { + } + else + { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; } std::cout << "\n\nRAJA vector addition example...\n"; - std::cout << "Using policy # "<(N); - int *b = memoryManager::allocate(N); - int *c = memoryManager::allocate(N); + // + // Allocate and initialize vector data + // + int* a = memoryManager::allocate(N); + int* b = memoryManager::allocate(N); + int* c = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = -i; b[i] = i; } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-style vector addition...\n"; // _cstyle_vector_add_start - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { c[i] = a[i] + b[i]; } // _cstyle_vector_add_end checkResult(c, N); -//printResult(c, N); + // printResult(c, N); -//----------------------------------------------------------------------------// -// Example of dynamic policy selection for forall -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Example of dynamic policy selection for forall + //----------------------------------------------------------------------------// RAJA::resources::Host host_res; #if defined(RAJA_ENABLE_CUDA) @@ -112,30 +121,32 @@ int main(int argc, char *argv[]) #endif #if defined(RAJA_ENABLE_SYCL) RAJA::resources::Sycl device_res; -#endif +#endif - //Get typed erased resource - it will internally store if we are running on the host or device -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) - RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); + // Get typed erased resource - it will internally store if we are running on + // the host or device +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || \ + defined(RAJA_ENABLE_SYCL) + RAJA::resources::Resource res = + RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); #else - RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); + RAJA::resources::Resource res = + RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); #endif - RAJA::expt::dynamic_forall - (res, pol, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE (int i) { - - c[i] = a[i] + b[i]; - - }); + RAJA::expt::dynamic_forall( + res, pol, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE(int i) { + c[i] = a[i] + b[i]; + }); checkResult(c, N); - //printResult(c, N); + // printResult(c, N); -//----------------------------------------------------------------------------// -// -// Clean up. 
-// + //----------------------------------------------------------------------------// + // + // Clean up. + // memoryManager::deallocate(a); memoryManager::deallocate(b); memoryManager::deallocate(c); @@ -151,12 +162,19 @@ int main(int argc, char *argv[]) void checkResult(int* res, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( res[i] != 0 ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (res[i] != 0) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -167,7 +185,8 @@ void checkResult(int* res, int len) void printResult(int* res, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "result[" << i << "] = " << res[i] << std::endl; } std::cout << std::endl; diff --git a/examples/resource-forall.cpp b/examples/resource-forall.cpp index b374bdba3f..8d729c4368 100644 --- a/examples/resource-forall.cpp +++ b/examples/resource-forall.cpp @@ -18,7 +18,7 @@ * Vector Addition Example * * Computes c = a + b, where a, b, c are vectors of ints. - * It illustrates similarities between a C-style for-loop and a RAJA + * It illustrates similarities between a C-style for-loop and a RAJA * forall loop. * * RAJA features shown: @@ -35,279 +35,275 @@ // // Functions for checking and printing results // -void checkResult(int* res, int len); +void checkResult(int* res, int len); void printResult(int* res, int len); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA vector addition example...\n"; -// -// Define vector length -// + // + // Define vector length + // const int N = 100000; -// -// Allocate and initialize vector data -// + // + // Allocate and initialize vector data + // RAJA::resources::Host host{}; - int *a = host.allocate(N); - int *b = host.allocate(N); - int *c = host.allocate(N); + int* a = host.allocate(N); + int* b = host.allocate(N); + int* c = host.allocate(N); - int *a_ = host.allocate(N); - int *b_ = host.allocate(N); - int *c_ = host.allocate(N); + int* a_ = host.allocate(N); + int* b_ = host.allocate(N); + int* c_ = host.allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = -i; b[i] = 2 * i; a_[i] = -i; b_[i] = 2 * i; - } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-style vector addition...\n"; - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { c[i] = a[i] + b[i]; } checkResult(c, N); -//----------------------------------------------------------------------------// -// RAJA::seq_exec policy enforces sequential execution.... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::seq_exec policy enforces sequential execution.... 
+ //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential vector addition...\n"; - RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); + RAJA::forall( + host, RAJA::RangeSegment(0, N), [=](int i) { c[i] = a[i] + b[i]; }); checkResult(c, N); -//----------------------------------------------------------------------------// -// RAJA::sind_exec policy enforces simd execution.... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::sind_exec policy enforces simd execution.... + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA simd_exec vector addition...\n"; - RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); + RAJA::forall( + host, RAJA::RangeSegment(0, N), [=](int i) { c[i] = a[i] + b[i]; }); checkResult(c, N); #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// RAJA::omp_for_parallel_exec policy execution.... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::omp_for_parallel_exec policy execution.... + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA omp_parallel_for_exec vector addition...\n"; - RAJA::forall(host, RAJA::RangeSegment(0, N), - [=] (int i) { - c[i] = a[i] + b[i]; - }); + RAJA::forall( + host, RAJA::RangeSegment(0, N), [=](int i) { c[i] = a[i] + b[i]; }); checkResult(c, N); -//----------------------------------------------------------------------------// -// RAJA::omp_parallel_for_static_exec policy execution.... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::omp_parallel_for_static_exec policy execution.... + //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA omp_parallel_for_static_exec (default chunksize) vector addition...\n"; + std::cout << "\n Running RAJA omp_parallel_for_static_exec (default " + "chunksize) vector addition...\n"; - RAJA::forall>(host, RAJA::RangeSegment(0, N), - [=] (int i) { - c[i] = a[i] + b[i]; - }); + RAJA::forall>( + host, RAJA::RangeSegment(0, N), [=](int i) { c[i] = a[i] + b[i]; }); checkResult(c, N); -//----------------------------------------------------------------------------// -// RAJA::omp_parallel_for_dynamic_exec policy execution.... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::omp_parallel_for_dynamic_exec policy execution.... 
+ //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA omp_for_dynamic_exec (chunksize = 16) vector addition...\n"; + std::cout << "\n Running RAJA omp_for_dynamic_exec (chunksize = 16) vector " + "addition...\n"; - RAJA::forall>(host, RAJA::RangeSegment(0, N), - [=] (int i) { - c[i] = a[i] + b[i]; - }); + RAJA::forall>( + host, RAJA::RangeSegment(0, N), [=](int i) { c[i] = a[i] + b[i]; }); checkResult(c, N); #endif +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || \ + defined(RAJA_ENABLE_SYCL) -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) - -/* - GPU_BLOCK_SIZE - specifies the number of threads in a CUDA/HIP thread block -*/ -const int GPU_BLOCK_SIZE = 256; + /* + GPU_BLOCK_SIZE - specifies the number of threads in a CUDA/HIP thread block + */ + const int GPU_BLOCK_SIZE = 256; -//----------------------------------------------------------------------------// -// RAJA::cuda/hip_exec policy execution.... -//----------------------------------------------------------------------------// -{ - std::cout << "\n Running RAJA GPU vector addition on 2 seperate streams...\n"; + //----------------------------------------------------------------------------// + // RAJA::cuda/hip_exec policy execution.... + //----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA GPU vector addition on 2 seperate " + "streams...\n"; #if defined(RAJA_ENABLE_CUDA) - RAJA::resources::Cuda res_gpu1; - RAJA::resources::Cuda res_gpu2; - using EXEC_POLICY = RAJA::cuda_exec_async; + RAJA::resources::Cuda res_gpu1; + RAJA::resources::Cuda res_gpu2; + using EXEC_POLICY = RAJA::cuda_exec_async; #elif defined(RAJA_ENABLE_HIP) - RAJA::resources::Hip res_gpu1; - RAJA::resources::Hip res_gpu2; - using EXEC_POLICY = RAJA::hip_exec_async; + RAJA::resources::Hip res_gpu1; + RAJA::resources::Hip res_gpu2; + using EXEC_POLICY = RAJA::hip_exec_async; #elif defined(RAJA_ENABLE_SYCL) -RAJA::resources::Sycl res_gpu1; -RAJA::resources::Sycl res_gpu2; -using EXEC_POLICY = RAJA::sycl_exec; + RAJA::resources::Sycl res_gpu1; + RAJA::resources::Sycl res_gpu2; + using EXEC_POLICY = RAJA::sycl_exec; #endif - int* d_a1 = res_gpu1.allocate(N); - int* d_b1 = res_gpu1.allocate(N); - int* d_c1 = res_gpu1.allocate(N); + int* d_a1 = res_gpu1.allocate(N); + int* d_b1 = res_gpu1.allocate(N); + int* d_c1 = res_gpu1.allocate(N); - int* d_a2 = res_gpu2.allocate(N); - int* d_b2 = res_gpu2.allocate(N); - int* d_c2 = res_gpu2.allocate(N); + int* d_a2 = res_gpu2.allocate(N); + int* d_b2 = res_gpu2.allocate(N); + int* d_c2 = res_gpu2.allocate(N); - res_gpu1.memcpy(d_a1, a, sizeof(int)* N); - res_gpu1.memcpy(d_b1, b, sizeof(int)* N); + res_gpu1.memcpy(d_a1, a, sizeof(int) * N); + res_gpu1.memcpy(d_b1, b, sizeof(int) * N); - res_gpu2.memcpy(d_a2, a, sizeof(int)* N); - res_gpu2.memcpy(d_b2, b, sizeof(int)* N); + res_gpu2.memcpy(d_a2, a, sizeof(int) * N); + res_gpu2.memcpy(d_b2, b, sizeof(int) * N); - RAJA::forall(res_gpu1, RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - d_c1[i] = d_a1[i] + d_b1[i]; - }); + RAJA::forall( + res_gpu1, RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(int i) { + d_c1[i] = d_a1[i] + d_b1[i]; + }); - RAJA::forall(res_gpu2, RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - d_c2[i] = d_a2[i] + d_b2[i]; - }); + RAJA::forall( + res_gpu2, RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(int i) { + d_c2[i] = d_a2[i] + d_b2[i]; + }); - res_gpu1.memcpy(c, d_c1, sizeof(int)*N ); + 
res_gpu1.memcpy(c, d_c1, sizeof(int) * N); - res_gpu2.memcpy(c_, d_c2, sizeof(int)*N ); + res_gpu2.memcpy(c_, d_c2, sizeof(int) * N); - checkResult(c, N); - checkResult(c_, N); + checkResult(c, N); + checkResult(c_, N); - res_gpu1.deallocate(d_a1); - res_gpu1.deallocate(d_b1); - res_gpu1.deallocate(d_c1); + res_gpu1.deallocate(d_a1); + res_gpu1.deallocate(d_b1); + res_gpu1.deallocate(d_c1); - res_gpu2.deallocate(d_a2); - res_gpu2.deallocate(d_b2); - res_gpu2.deallocate(d_c2); -} + res_gpu2.deallocate(d_a2); + res_gpu2.deallocate(d_b2); + res_gpu2.deallocate(d_c2); + } -//----------------------------------------------------------------------------// -// RAJA::cuda/hip_exec policy with waiting event.... -//----------------------------------------------------------------------------// -{ - std::cout << "\n Running RAJA GPU vector with dependency between two seperate streams...\n"; + //----------------------------------------------------------------------------// + // RAJA::cuda/hip_exec policy with waiting event.... + //----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA GPU vector with dependency between two " + "seperate streams...\n"; #if defined(RAJA_ENABLE_CUDA) - // _raja_res_defres_start - RAJA::resources::Cuda res_gpu1; - RAJA::resources::Cuda res_gpu2; - RAJA::resources::Host res_host; + // _raja_res_defres_start + RAJA::resources::Cuda res_gpu1; + RAJA::resources::Cuda res_gpu2; + RAJA::resources::Host res_host; - using EXEC_POLICY = RAJA::cuda_exec_async; - // _raja_res_defres_end + using EXEC_POLICY = RAJA::cuda_exec_async; + // _raja_res_defres_end #elif defined(RAJA_ENABLE_HIP) - RAJA::resources::Hip res_gpu1; - RAJA::resources::Hip res_gpu2; - RAJA::resources::Host res_host; + RAJA::resources::Hip res_gpu1; + RAJA::resources::Hip res_gpu2; + RAJA::resources::Host res_host; - using EXEC_POLICY = RAJA::hip_exec_async; + using EXEC_POLICY = RAJA::hip_exec_async; #elif defined(RAJA_ENABLE_SYCL) - RAJA::resources::Sycl res_gpu1; - RAJA::resources::Sycl res_gpu2; - RAJA::resources::Host res_host; + RAJA::resources::Sycl res_gpu1; + RAJA::resources::Sycl res_gpu2; + RAJA::resources::Host res_host; - using EXEC_POLICY = RAJA::sycl_exec; + using EXEC_POLICY = RAJA::sycl_exec; #endif - // _raja_res_alloc_start - int* d_array1 = res_gpu1.allocate(N); - int* d_array2 = res_gpu2.allocate(N); - int* h_array = res_host.allocate(N); - // _raja_res_alloc_end - - // _raja_res_k1_start - RAJA::forall(res_gpu1, RAJA::RangeSegment(0,N), - [=] RAJA_HOST_DEVICE (int i) { - d_array1[i] = i; - } - ); - // _raja_res_k1_end - - // _raja_res_k2_start - RAJA::resources::Event e = RAJA::forall(res_gpu2, RAJA::RangeSegment(0,N), - [=] RAJA_HOST_DEVICE (int i) { - d_array2[i] = -1; - } - ); - // _raja_res_k2_end - - // _raja_res_wait_start - res_gpu2.wait_for(&e); - // _raja_res_wait_end - - // _raja_res_k3_start - RAJA::forall(res_gpu1, RAJA::RangeSegment(0,N), - [=] RAJA_HOST_DEVICE (int i) { - d_array1[i] *= d_array2[i]; - } - ); - // _raja_res_k3_end - - // _raja_res_memcpy_start - res_gpu1.memcpy(h_array, d_array1, sizeof(int) * N); - // _raja_res_memcpy_end - - // _raja_res_k4_start - bool check = true; - RAJA::forall(res_host, RAJA::RangeSegment(0,N), - [&check, h_array] (int i) { - if(h_array[i] != -i) {check = false;} - } - ); - // _raja_res_k4_end - - std::cout << "\n result -- "; - if (check) std::cout << "PASS\n"; - else std::cout << "FAIL\n"; - - res_gpu1.deallocate(d_array1); - res_gpu2.deallocate(d_array2); - 
res_host.deallocate(h_array); - -} + // _raja_res_alloc_start + int* d_array1 = res_gpu1.allocate(N); + int* d_array2 = res_gpu2.allocate(N); + int* h_array = res_host.allocate(N); + // _raja_res_alloc_end + + // _raja_res_k1_start + RAJA::forall(res_gpu1, + RAJA::RangeSegment(0, N), + [=] RAJA_HOST_DEVICE(int i) { d_array1[i] = i; }); + // _raja_res_k1_end + + // _raja_res_k2_start + RAJA::resources::Event e = RAJA::forall( + res_gpu2, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE(int i) { + d_array2[i] = -1; + }); + // _raja_res_k2_end + + // _raja_res_wait_start + res_gpu2.wait_for(&e); + // _raja_res_wait_end + + // _raja_res_k3_start + RAJA::forall( + res_gpu1, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE(int i) { + d_array1[i] *= d_array2[i]; + }); + // _raja_res_k3_end + + // _raja_res_memcpy_start + res_gpu1.memcpy(h_array, d_array1, sizeof(int) * N); + // _raja_res_memcpy_end + + // _raja_res_k4_start + bool check = true; + RAJA::forall( + res_host, RAJA::RangeSegment(0, N), [&check, h_array](int i) { + if (h_array[i] != -i) + { + check = false; + } + }); + // _raja_res_k4_end + + std::cout << "\n result -- "; + if (check) + std::cout << "PASS\n"; + else + std::cout << "FAIL\n"; + + res_gpu1.deallocate(d_array1); + res_gpu2.deallocate(d_array2); + res_host.deallocate(h_array); + } #endif -// -// -// Clean up. -// + // + // + // Clean up. + // host.deallocate(a); host.deallocate(b); host.deallocate(c); @@ -324,15 +320,22 @@ using EXEC_POLICY = RAJA::sycl_exec; // // Function to check result and report P/F. // -void checkResult(int* res, int len) +void checkResult(int* res, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( res[i] != i ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (res[i] != i) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -343,7 +346,8 @@ void checkResult(int* res, int len) void printResult(int* res, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "result[" << i << "] = " << res[i] << std::endl; } std::cout << std::endl; diff --git a/examples/resource-kernel.cpp b/examples/resource-kernel.cpp index a754876479..a38f5c83a1 100644 --- a/examples/resource-kernel.cpp +++ b/examples/resource-kernel.cpp @@ -10,7 +10,7 @@ using namespace RAJA; -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { #if defined(RAJA_ENABLE_CUDA) @@ -21,54 +21,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::resources::Cuda def_cuda_res{RAJA::resources::Cuda::get_default()}; RAJA::resources::Host def_host_res{RAJA::resources::Host::get_default()}; - int* d_array = def_cuda_res.allocate(N*M); - int* h_array = def_host_res.allocate(N*M); + int* d_array = def_cuda_res.allocate(N * M); + int* h_array = def_host_res.allocate(N * M); RAJA::RangeSegment one_range(0, 1); RAJA::RangeSegment m_range(0, M); RAJA::RangeSegment n_range(0, N); - using TEST_POL = - RAJA::KernelPolicy< - statement::CudaKernelAsync< - statement::For<0, cuda_block_x_loop, - statement::For<1, cuda_thread_x_loop, - statement::Lambda<0> - > - > - > - >; + using TEST_POL = RAJA::KernelPolicy>>>>; - RAJA::forall(def_host_res, n_range, - [=, &def_cuda_res](int i){ - RAJA::resources::Cuda res_cuda; + RAJA::forall( + def_host_res, n_range, [=, &def_cuda_res](int i) { + 
RAJA::resources::Cuda res_cuda; - RAJA::resources::Event e = RAJA::kernel_resource( - RAJA::make_tuple(one_range, - m_range), + RAJA::resources::Event e = RAJA::kernel_resource( + RAJA::make_tuple(one_range, m_range), - res_cuda, + res_cuda, - [=] RAJA_DEVICE (int k, int j) { - d_array[i*M + j] = i * M + j; - } - ); + [=] RAJA_DEVICE(int k, int j) { d_array[i * M + j] = i * M + j; }); - def_cuda_res.wait_for(&e); - } - ); + def_cuda_res.wait_for(&e); + }); def_cuda_res.memcpy(h_array, d_array, sizeof(int) * N * M); int ec_count = 0; - RAJA::forall( RAJA::RangeSegment(0, N*M), - [=, &ec_count](int i){ - if (h_array[i] != i) ec_count++; - } - ); + RAJA::forall(RAJA::RangeSegment(0, N * M), + [=, &ec_count](int i) { + if (h_array[i] != i) ec_count++; + }); std::cout << " Result -- "; - if (ec_count > 0) + if (ec_count > 0) std::cout << "FAIL : error count = " << ec_count << "\n"; else std::cout << "PASS!\n"; diff --git a/examples/resource-launch.cpp b/examples/resource-launch.cpp index 288b70f8a5..12c228e91e 100644 --- a/examples/resource-launch.cpp +++ b/examples/resource-launch.cpp @@ -10,7 +10,7 @@ using namespace RAJA; -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { #if defined(RAJA_ENABLE_CUDA) @@ -21,8 +21,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::resources::Cuda def_cuda_res{RAJA::resources::Cuda::get_default()}; RAJA::resources::Host def_host_res{RAJA::resources::Host::get_default()}; - int* d_array = def_cuda_res.allocate(N*M); - int* h_array = def_host_res.allocate(N*M); + int* d_array = def_cuda_res.allocate(N * M); + int* h_array = def_host_res.allocate(N * M); RAJA::RangeSegment one_range(0, 1); RAJA::RangeSegment m_range(0, M); @@ -34,39 +34,31 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using threads_x = RAJA::LoopPolicy; - RAJA::forall(def_host_res, n_range, - [=, &def_cuda_res](int i){ - - RAJA::resources::Cuda res_cuda; - - RAJA::resources::Event e = - RAJA::launch(res_cuda, - RAJA::LaunchParams(RAJA::Teams(64), - RAJA::Threads(1)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, m_range, [&] (int j) { - RAJA::loop(ctx, one_range, [&] (int k) { - - d_array[i*M + j] = i * M + j; - - }); - }); - + RAJA::forall( + def_host_res, n_range, [=, &def_cuda_res](int i) { + RAJA::resources::Cuda res_cuda; + + RAJA::resources::Event e = RAJA::launch( + res_cuda, + RAJA::LaunchParams(RAJA::Teams(64), RAJA::Threads(1)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, m_range, [&](int j) { + RAJA::loop(ctx, one_range, [&](int k) { + d_array[i * M + j] = i * M + j; + }); + }); + }); + + def_cuda_res.wait_for(&e); }); - def_cuda_res.wait_for(&e); - } - ); - def_cuda_res.memcpy(h_array, d_array, sizeof(int) * N * M); int ec_count = 0; - RAJA::forall( RAJA::RangeSegment(0, N*M), - [=, &ec_count](int i){ - if (h_array[i] != i) ec_count++; - } - ); + RAJA::forall(RAJA::RangeSegment(0, N * M), + [=, &ec_count](int i) { + if (h_array[i] != i) ec_count++; + }); std::cout << " Result -- "; if (ec_count > 0) diff --git a/examples/resource-runtime-launch.cpp b/examples/resource-runtime-launch.cpp index e52923d81f..87c10fa871 100644 --- a/examples/resource-runtime-launch.cpp +++ b/examples/resource-runtime-launch.cpp @@ -42,15 +42,17 @@ using device_loop = RAJA::hip_global_thread_x; using launch_policy = RAJA::LaunchPolicy; + >; using loop_pol = RAJA::LoopPolicy; + >; #if defined(RAJA_ENABLE_CUDA) using 
reduce_policy = RAJA::cuda_reduce; @@ -60,11 +62,13 @@ using reduce_policy = RAJA::hip_reduce; using reduce_policy = RAJA::seq_reduce; #endif -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { - if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./teams_reductions host or ./tut_reductions device"); + if (argc != 2) + { + RAJA_ABORT_OR_THROW("Usage ./teams_reductions host or ./tut_reductions " + "device"); } // @@ -73,39 +77,51 @@ int main(int argc, char *argv[]) // Example usage ./teams_reductions host or ./teams_reductions device // std::string exec_space = argv[1]; - if(!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0 )){ - RAJA_ABORT_OR_THROW("Usage ./teams_reductions host or ./teams_reductions device"); + if (!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0)) + { + RAJA_ABORT_OR_THROW("Usage ./teams_reductions host or ./teams_reductions " + "device"); return 0; } RAJA::ExecPlace select_cpu_or_gpu; - if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA-Teams reductions example on the host \n"); } - if(exec_space.compare("device") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA-Teams reductions example on the device \n"); } + if (exec_space.compare("host") == 0) + { + select_cpu_or_gpu = RAJA::ExecPlace::HOST; + printf("Running RAJA-Teams reductions example on the host \n"); + } + if (exec_space.compare("device") == 0) + { + select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; + printf("Running RAJA-Teams reductions example on the device \n"); + } // _reductions_array_init_start -// -// Define array length -// + // + // Define array length + // const int N = 1000000; -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. 
+ // int* a = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { - if ( i % 2 == 0 ) { + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { a[i] = 1; - } else { + } + else + { a[i] = -1; } } -// -// Set min and max loc values -// + // + // Set min and max loc values + // const int minloc_ref = N / 2; a[minloc_ref] = -100; @@ -113,35 +129,39 @@ int main(int argc, char *argv[]) a[maxloc_ref] = 100; // _reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// - the sum will be zero -// - the min will be -100 -// - the max will be 100 -// - the min loc will be N/2 -// - the max loc will be N/2 + 1 -// -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // - the sum will be zero + // - the min will be -100 + // - the max will be 100 + // - the min loc will be N/2 + // - the max loc will be N/2 + 1 + // + // -// -// Define index range for iterating over a elements in all examples -// + // + // Define index range for iterating over a elements in all examples + // // _reductions_range_start RAJA::RangeSegment arange(0, N); // _reductions_range_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// RAJA::ReduceSum kernel_sum(0); - RAJA::ReduceMin kernel_min(std::numeric_limits::max()); - RAJA::ReduceMax kernel_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc kernel_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc kernel_maxloc(std::numeric_limits::min(), -1); + RAJA::ReduceMin kernel_min( + std::numeric_limits::max()); + RAJA::ReduceMax kernel_max( + std::numeric_limits::min()); + RAJA::ReduceMinLoc kernel_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc kernel_maxloc( + std::numeric_limits::min(), -1); const int TEAM_SZ = 256; - const int GRID_SZ = RAJA_DIVIDE_CEILING_INT(N,TEAM_SZ); + const int GRID_SZ = RAJA_DIVIDE_CEILING_INT(N, TEAM_SZ); RAJA::resources::Host host_res; @@ -152,44 +172,47 @@ int main(int argc, char *argv[]) RAJA::resources::Hip device_res; #endif - //Get typed erased resource - it will internally store if we are running on the host or device + // Get typed erased resource - it will internally store if we are running on + // the host or device #if defined(RAJA_GPU_ACTIVE) && !defined(RAJA_ENABLE_SYCL) - RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); + RAJA::resources::Resource res = + RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); #else - RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); + RAJA::resources::Resource res = + RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); #endif - //How the kernel executes now depends on how the resource is constructed (host or device) - RAJA::launch - (res, RAJA::LaunchParams(RAJA::Teams(GRID_SZ), - RAJA::Threads(TEAM_SZ)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::loop(ctx, arange, [&] (int i) { - - kernel_sum += a[i]; + // How the kernel executes now depends on how the resource is constructed + // (host or device) + RAJA::launch( + res, + RAJA::LaunchParams(RAJA::Teams(GRID_SZ), RAJA::Threads(TEAM_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, arange, [&](int i) { + kernel_sum += a[i]; - kernel_min.min(a[i]); - kernel_max.max(a[i]); + 
kernel_min.min(a[i]); + kernel_max.max(a[i]); - kernel_minloc.minloc(a[i], i); - kernel_maxloc.maxloc(a[i], i); - }); - }); + kernel_minloc.minloc(a[i], i); + kernel_maxloc.maxloc(a[i], i); + }); + }); std::cout << "\tsum = " << kernel_sum.get() << std::endl; std::cout << "\tmin = " << kernel_min.get() << std::endl; std::cout << "\tmax = " << kernel_max.get() << std::endl; std::cout << "\tmin, loc = " << kernel_minloc.get() << " , " - << kernel_minloc.getLoc() << std::endl; + << kernel_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << kernel_maxloc.get() << " , " - << kernel_maxloc.getLoc() << std::endl; + << kernel_maxloc.getLoc() << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(a); std::cout << "\n DONE!...\n"; diff --git a/examples/tut_daxpy.cpp b/examples/tut_daxpy.cpp index 74b127e0d6..39ecaad085 100644 --- a/examples/tut_daxpy.cpp +++ b/examples/tut_daxpy.cpp @@ -15,12 +15,12 @@ * Daxpy Example * * Computes a += b*c, where a, b are vectors of doubles - * and c is a scalar double. It illustrates similarities between a - * C-style for-loop and a RAJA forall loop. + * and c is a scalar double. It illustrates similarities between a + * C-style for-loop and a RAJA forall loop. * * RAJA features shown: * - `forall` loop iteration template method - * - Index range segment + * - Index range segment * - Execution policies */ @@ -28,187 +28,184 @@ // Functions for checking and printing results // void checkResult(double* v1, double* v2, int len); -void printResult(double* v, int len); +void printResult(double* v, int len); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA daxpy example...\n"; -// -// Define vector length -// + // + // Define vector length + // const int N = 1000000; -// -// Allocate and initialize vector data. -// + // + // Allocate and initialize vector data. + // double* a0 = new double[N]; double* aref = new double[N]; double* ta = new double[N]; double* tb = new double[N]; - + double c = 3.14159; - - for (int i = 0; i < N; i++) { + + for (int i = 0; i < N; i++) + { a0[i] = 1.0; tb[i] = 2.0; } -// -// Declare and set pointers to array data. -// We reset them for each daxpy version so that -// they all look the same. -// + // + // Declare and set pointers to array data. + // We reset them for each daxpy version so that + // they all look the same. + // double* a = ta; double* b = tb; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - for (int i = 0; i < N; ++i) { + std::memcpy(a, a0, N * sizeof(double)); + + for (int i = 0; i < N; ++i) + { a[i] += b[i] * c; } - std::memcpy( aref, a, N* sizeof(double) ); + std::memcpy(aref, a, N * sizeof(double)); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// In the following, we show a RAJA version -// of the daxpy operation and how it can -// be run differently by choosing different -// RAJA execution policies. 
-// -// Note that the only thing that changes in -// these versions is the execution policy. -// To implement these cases using the -// programming model choices directly, would -// require unique changes for each. -// - -//----------------------------------------------------------------------------// + // + // In the following, we show a RAJA version + // of the daxpy operation and how it can + // be run differently by choosing different + // RAJA execution policies. + // + // Note that the only thing that changes in + // these versions is the execution policy. + // To implement these cases using the + // programming model choices directly, would + // require unique changes for each. + // + + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - a[i] += b[i] * c; - }); + std::memcpy(a, a0, N * sizeof(double)); + + RAJA::forall(RAJA::RangeSegment(0, N), + [=](int i) { a[i] += b[i] * c; }); checkResult(a, aref, N); -//printResult(a, N); + // printResult(a, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// RAJA SIMD version. -// + // + // RAJA SIMD version. + // std::cout << "\n Running RAJA SIMD daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - a[i] += b[i] * c; - }); + std::memcpy(a, a0, N * sizeof(double)); + + RAJA::forall(RAJA::RangeSegment(0, N), + [=](int i) { a[i] += b[i] * c; }); checkResult(a, aref, N); -//printResult(a, N); + // printResult(a, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - a[i] += b[i] * c; - }); + std::memcpy(a, a0, N * sizeof(double)); + + RAJA::forall(RAJA::RangeSegment(0, N), + [=](int i) { a[i] += b[i] * c; }); checkResult(a, aref, N); -//printResult(a, N); +// printResult(a, N); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -// -// RAJA CUDA parallel GPU version (256 threads per thread block). -// + // + // RAJA CUDA parallel GPU version (256 threads per thread block). 
+ // std::cout << "\n Running RAJA CUDA daxpy...\n"; - a = 0; b = 0; - cudaErrchk(cudaMalloc( (void**)&a, N * sizeof(double) )); - cudaErrchk(cudaMalloc( (void**)&b, N * sizeof(double) )); - - cudaErrchk(cudaMemcpy( a, a0, N * sizeof(double), cudaMemcpyHostToDevice )); - cudaErrchk(cudaMemcpy( b, tb, N * sizeof(double), cudaMemcpyHostToDevice )); + a = 0; + b = 0; + cudaErrchk(cudaMalloc((void**)&a, N * sizeof(double))); + cudaErrchk(cudaMalloc((void**)&b, N * sizeof(double))); - RAJA::forall>(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - a[i] += b[i] * c; - }); + cudaErrchk(cudaMemcpy(a, a0, N * sizeof(double), cudaMemcpyHostToDevice)); + cudaErrchk(cudaMemcpy(b, tb, N * sizeof(double), cudaMemcpyHostToDevice)); + + RAJA::forall>( + RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(int i) { a[i] += b[i] * c; }); - cudaErrchk(cudaMemcpy( ta, a, N * sizeof(double), cudaMemcpyDeviceToHost )); + cudaErrchk(cudaMemcpy(ta, a, N * sizeof(double), cudaMemcpyDeviceToHost)); cudaErrchk(cudaFree(a)); cudaErrchk(cudaFree(b)); a = ta; checkResult(a, aref, N); -//printResult(a, N); +// printResult(a, N); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -// -// RAJA HIP parallel GPU version (256 threads per thread block). -// + // + // RAJA HIP parallel GPU version (256 threads per thread block). + // std::cout << "\n Running RAJA HIP daxpy...\n"; - a = 0; b = 0; - hipErrchk(hipMalloc( (void**)&a, N * sizeof(double) )); - hipErrchk(hipMalloc( (void**)&b, N * sizeof(double) )); + a = 0; + b = 0; + hipErrchk(hipMalloc((void**)&a, N * sizeof(double))); + hipErrchk(hipMalloc((void**)&b, N * sizeof(double))); - hipErrchk(hipMemcpy( a, a0, N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( b, tb, N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(a, a0, N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(b, tb, N * sizeof(double), hipMemcpyHostToDevice)); - RAJA::forall>(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - a[i] += b[i] * c; - }); + RAJA::forall>( + RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(int i) { a[i] += b[i] * c; }); - hipErrchk(hipMemcpy( ta, a, N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(ta, a, N * sizeof(double), hipMemcpyDeviceToHost)); hipErrchk(hipFree(a)); hipErrchk(hipFree(b)); a = ta; checkResult(a, aref, N); -//printResult(a, N); +// printResult(a, N); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// - delete[] a0; - delete[] aref; - delete[] ta; + // + // Clean up. + // + delete[] a0; + delete[] aref; + delete[] ta; delete[] tb; - + std::cout << "\n DONE!...\n"; return 0; @@ -217,26 +214,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to compare result to reference and report P/F. // -void checkResult(double* v1, double* v2, int len) +void checkResult(double* v1, double* v2, int len) { bool match = true; - for (int i = 0; i < len; i++) { - if ( v1[i] != v2[i] ) { match = false; } + for (int i = 0; i < len; i++) + { + if (v1[i] != v2[i]) + { + match = false; + } } - if ( match ) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; - } + } } // -// Function to print result. +// Function to print result. 
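The CUDA and HIP variants above follow one pattern: allocate device arrays, copy the inputs over, run the forall with a GPU execution policy whose block size is a template parameter, then copy the result back and free. A condensed sketch of that flow for CUDA with 256-thread blocks (d_a and d_b are illustrative names for the device arrays, not names used in the example):

  // Condensed sketch of the GPU daxpy flow: allocate, copy in, run, copy out, free.
  double* d_a = nullptr;
  double* d_b = nullptr;
  cudaErrchk(cudaMalloc((void**)&d_a, N * sizeof(double)));
  cudaErrchk(cudaMalloc((void**)&d_b, N * sizeof(double)));
  cudaErrchk(cudaMemcpy(d_a, a0, N * sizeof(double), cudaMemcpyHostToDevice));
  cudaErrchk(cudaMemcpy(d_b, tb, N * sizeof(double), cudaMemcpyHostToDevice));

  RAJA::forall<RAJA::cuda_exec<256>>(
      RAJA::RangeSegment(0, N),
      [=] RAJA_DEVICE(int i) { d_a[i] += d_b[i] * c; });

  cudaErrchk(cudaMemcpy(ta, d_a, N * sizeof(double), cudaMemcpyDeviceToHost));
  cudaErrchk(cudaFree(d_a));
  cudaErrchk(cudaFree(d_b));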
// -void printResult(double* v, int len) +void printResult(double* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "result[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; diff --git a/examples/tut_halo-exchange.cpp b/examples/tut_halo-exchange.cpp index c584695128..0ce21573bd 100644 --- a/examples/tut_halo-exchange.cpp +++ b/examples/tut_halo-exchange.cpp @@ -34,8 +34,9 @@ */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block when using forall - CUDA_WORKGROUP_BLOCK_SIZE - specifies the number of threads in a CUDA thread block when using workgroup + CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block when + using forall CUDA_WORKGROUP_BLOCK_SIZE - specifies the number of threads in a + CUDA thread block when using workgroup */ #if defined(RAJA_ENABLE_CUDA) const int CUDA_BLOCK_SIZE = 256; @@ -56,42 +57,51 @@ const int num_neighbors = 26; // // Functions for checking and printing results // -void checkResult(std::vector const& vars, std::vector const& vars_ref, - int var_size, int num_vars); +void checkResult(std::vector const& vars, + std::vector const& vars_ref, + int var_size, + int num_vars); void printResult(std::vector const& vars, int var_size, int num_vars); // // Functions for allocating and populating packing and unpacking lists // -void create_pack_lists(std::vector& pack_index_lists, std::vector& pack_index_list_lengths, - const int halo_width, const int* grid_dims); -void create_unpack_lists(std::vector& unpack_index_lists, std::vector& unpack_index_list_lengths, - const int halo_width, const int* grid_dims); +void create_pack_lists(std::vector& pack_index_lists, + std::vector& pack_index_list_lengths, + const int halo_width, + const int* grid_dims); +void create_unpack_lists(std::vector& unpack_index_lists, + std::vector& unpack_index_list_lengths, + const int halo_width, + const int* grid_dims); void destroy_pack_lists(std::vector& pack_index_lists); void destroy_unpack_lists(std::vector& unpack_index_lists); -template < typename T > +template struct memory_manager_allocator { using value_type = T; memory_manager_allocator() = default; - template < typename U > - constexpr memory_manager_allocator(memory_manager_allocator const&) noexcept - { } + template + constexpr memory_manager_allocator( + memory_manager_allocator const&) noexcept + {} /*[[nodiscard]]*/ value_type* allocate(size_t num) { - if (num > std::numeric_limits::max() / sizeof(value_type)) { + if (num > std::numeric_limits::max() / sizeof(value_type)) + { throw std::bad_alloc(); } - value_type *ptr = memoryManager::allocate(num); + value_type* ptr = memoryManager::allocate(num); - if (!ptr) { + if (!ptr) + { throw std::bad_alloc(); } @@ -106,45 +116,49 @@ struct memory_manager_allocator }; template -bool operator==(memory_manager_allocator const&, memory_manager_allocator const&) +bool operator==(memory_manager_allocator const&, + memory_manager_allocator const&) { return true; } template -bool operator!=(memory_manager_allocator const& lhs, memory_manager_allocator const& rhs) +bool operator!=(memory_manager_allocator const& lhs, + memory_manager_allocator const& rhs) { return !(lhs == rhs); } #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) -template < typename T > +template struct pinned_allocator { using value_type = T; pinned_allocator() = default; - template < typename U > + template constexpr pinned_allocator(pinned_allocator const&) noexcept - { } + {} 
/*[[nodiscard]]*/ value_type* allocate(size_t num) { - if (num > std::numeric_limits::max() / sizeof(value_type)) { + if (num > std::numeric_limits::max() / sizeof(value_type)) + { throw std::bad_alloc(); } - value_type *ptr = nullptr; + value_type* ptr = nullptr; #if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaMallocHost((void **)&ptr, num*sizeof(value_type))); + cudaErrchk(cudaMallocHost((void**)&ptr, num * sizeof(value_type))); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipHostMalloc((void **)&ptr, num*sizeof(value_type))); + hipErrchk(hipHostMalloc((void**)&ptr, num * sizeof(value_type))); #endif - if (!ptr) { + if (!ptr) + { throw std::bad_alloc(); } @@ -176,12 +190,13 @@ bool operator!=(pinned_allocator const& lhs, pinned_allocator const& rhs) #endif -int main(int argc, char **argv) +int main(int argc, char** argv) { std::cout << "\n\nRAJA halo exchange example...\n"; - if (argc != 1 && argc != 7) { + if (argc != 1 && argc != 7) + { std::cerr << "Usage: tut_halo-exchange " << "[grid_x grid_y grid_z halo_width num_vars num_cycles]\n"; std::exit(1); @@ -194,47 +209,46 @@ int main(int argc, char **argv) // Define number of grid variables // Define number of cycles // - const int grid_dims[3] = { (argc != 7) ? 100 : std::atoi(argv[1]), - (argc != 7) ? 100 : std::atoi(argv[2]), - (argc != 7) ? 100 : std::atoi(argv[3]) }; - const int halo_width = (argc != 7) ? 1 : std::atoi(argv[4]); - const int num_vars = (argc != 7) ? 3 : std::atoi(argv[5]); - const int num_cycles = (argc != 7) ? 3 : std::atoi(argv[6]); + const int grid_dims[3] = {(argc != 7) ? 100 : std::atoi(argv[1]), + (argc != 7) ? 100 : std::atoi(argv[2]), + (argc != 7) ? 100 : std::atoi(argv[3])}; + const int halo_width = (argc != 7) ? 1 : std::atoi(argv[4]); + const int num_vars = (argc != 7) ? 3 : std::atoi(argv[5]); + const int num_cycles = (argc != 7) ? 3 : std::atoi(argv[6]); // _halo_exchange_input_params_end - std::cout << "grid dimensions " << grid_dims[0] - << " x " << grid_dims[1] - << " x " << grid_dims[2] << "\n" - << "halo width " << halo_width << "\n" - << "number of variables " << num_vars << "\n" - << "number of cycles " << num_cycles << "\n"; + std::cout << "grid dimensions " << grid_dims[0] << " x " << grid_dims[1] + << " x " << grid_dims[2] << "\n" + << "halo width " << halo_width << "\n" + << "number of variables " << num_vars << "\n" + << "number of cycles " << num_cycles << "\n"; - if ( grid_dims[0] < halo_width || - grid_dims[1] < halo_width || - grid_dims[2] < halo_width ) { + if (grid_dims[0] < halo_width || grid_dims[1] < halo_width || + grid_dims[2] < halo_width) + { std::cerr << "Error: " << "grid dimensions must not be smaller than the halo width\n"; std::exit(1); } - const int grid_plus_halo_dims[3] = { grid_dims[0] + 2*halo_width, - grid_dims[1] + 2*halo_width, - grid_dims[2] + 2*halo_width }; + const int grid_plus_halo_dims[3] = {grid_dims[0] + 2 * halo_width, + grid_dims[1] + 2 * halo_width, + grid_dims[2] + 2 * halo_width}; - const int var_size = grid_plus_halo_dims[0] * - grid_plus_halo_dims[1] * - grid_plus_halo_dims[2] ; + const int var_size = + grid_plus_halo_dims[0] * grid_plus_halo_dims[1] * grid_plus_halo_dims[2]; // _halo_exchange_vars_allocate_start // // Allocate grid variables and reference grid variables used to check // correctness. 
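With the default arguments this sizing works out as follows: each grid dimension is padded by halo_width cells on both sides, so the default 100 x 100 x 100 grid with halo_width 1 becomes 102 x 102 x 102, and each variable holds var_size = 102^3 = 1,061,208 doubles (roughly 8.5 MB). A small worked sketch of the same computation:

  // Worked example of the sizing above with the default inputs.
  const int default_dim  = 100 + 2 * 1;                              // 102
  const int default_size = default_dim * default_dim * default_dim;  // 1,061,208
  // default_size * sizeof(double) is about 8.5 MB per grid variable.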
// - std::vector vars (num_vars, nullptr); + std::vector vars(num_vars, nullptr); std::vector vars_ref(num_vars, nullptr); - for (int v = 0; v < num_vars; ++v) { - vars[v] = memoryManager::allocate(var_size); + for (int v = 0; v < num_vars; ++v) + { + vars[v] = memoryManager::allocate(var_size); vars_ref[v] = memoryManager::allocate(var_size); } // _halo_exchange_vars_allocate_end @@ -245,12 +259,14 @@ int main(int argc, char **argv) // Generate index lists for packing and unpacking // std::vector pack_index_lists(num_neighbors, nullptr); - std::vector pack_index_list_lengths(num_neighbors, 0); - create_pack_lists(pack_index_lists, pack_index_list_lengths, halo_width, grid_dims); + std::vector pack_index_list_lengths(num_neighbors, 0); + create_pack_lists( + pack_index_lists, pack_index_list_lengths, halo_width, grid_dims); std::vector unpack_index_lists(num_neighbors, nullptr); - std::vector unpack_index_list_lengths(num_neighbors, 0); - create_unpack_lists(unpack_index_lists, unpack_index_list_lengths, halo_width, grid_dims); + std::vector unpack_index_list_lengths(num_neighbors, 0); + create_unpack_lists( + unpack_index_lists, unpack_index_list_lengths, halo_width, grid_dims); // _halo_exchange_index_list_generate_end @@ -263,7 +279,7 @@ int main(int argc, char **argv) auto timer = RAJA::Timer(); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// { std::cout << "\n Running C-style halo exchange...\n"; @@ -272,74 +288,82 @@ int main(int argc, char **argv) std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate(buffer_len); - } - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - for (int i = 0; i < var_size; i++) { - var[i] = i + v; + for (int i = 0; i < var_size; i++) + { + var[i] = i + v; + } } - } - // _halo_exchange_sequential_cstyle_packing_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_sequential_cstyle_packing_start + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - for (int i = 0; i < len; i++) { - buffer[i] = var[list[i]]; + for (int i = 0; i < len; i++) + { + buffer[i] = var[list[i]]; + } + + buffer += len; } - buffer += len; + // send single message } + // _halo_exchange_sequential_cstyle_packing_end - // send single message - } - // _halo_exchange_sequential_cstyle_packing_end + // _halo_exchange_sequential_cstyle_unpacking_start + for (int l = 0; l < num_neighbors; ++l) + { - // _halo_exchange_sequential_cstyle_unpacking_start - for (int l = 0; l < num_neighbors; ++l) { + // recv single message - // recv single message + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - double* buffer = buffers[l]; - int* list = unpack_index_lists[l]; - int len = 
unpack_index_list_lengths[l]; + // unpack + for (int v = 0; v < num_vars; ++v) + { - // unpack - for (int v = 0; v < num_vars; ++v) { + double* var = vars[v]; - double* var = vars[v]; + for (int i = 0; i < len; i++) + { + var[list[i]] = buffer[i]; + } - for (int i = 0; i < len; i++) { - var[list[i]] = buffer[i]; + buffer += len; } - - buffer += len; } - } - // _halo_exchange_sequential_cstyle_unpacking_end - + // _halo_exchange_sequential_cstyle_unpacking_end } timer.stop(); @@ -348,30 +372,33 @@ int main(int argc, char **argv) timer.reset(); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate(buffers[l]); - } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // copy result of exchange for reference later - for (int v = 0; v < num_vars; ++v) { + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; double* var_ref = vars_ref[v]; - for (int i = 0; i < var_size; i++) { + for (int i = 0; i < var_size; i++) + { var_ref[i] = var[i]; } } } -//----------------------------------------------------------------------------// -// Separate packing/unpacking loops using forall -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Separate packing/unpacking loops using forall + //----------------------------------------------------------------------------// { std::cout << "\n Running RAJA loop forall halo exchange...\n"; @@ -383,74 +410,78 @@ int main(int argc, char **argv) std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate(buffer_len); - } - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, var_size), [=] (int i) { - var[i] = i + v; - }); - } + RAJA::forall(range_segment(0, var_size), + [=](int i) { var[i] = i + v; }); + } - // _halo_exchange_seq_forall_packing_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_seq_forall_packing_start + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, len), [=] (int i) { - buffer[i] = var[list[i]]; - }); + RAJA::forall(range_segment(0, len), [=](int i) { + buffer[i] = var[list[i]]; + }); - buffer += len; - } + buffer += len; + } - // send single message - } - // _halo_exchange_seq_forall_packing_end + // send single message + } + // _halo_exchange_seq_forall_packing_end - // _halo_exchange_seq_forall_unpacking_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_seq_forall_unpacking_start + for (int l = 0; l < num_neighbors; ++l) + { - // recv single message + // recv single message - double* buffer = buffers[l]; - int* 
list = unpack_index_lists[l]; - int len = unpack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - // unpack - for (int v = 0; v < num_vars; ++v) { + // unpack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, len), [=] (int i) { - var[list[i]] = buffer[i]; - }); + RAJA::forall(range_segment(0, len), [=](int i) { + var[list[i]] = buffer[i]; + }); - buffer += len; + buffer += len; + } } - } - // _halo_exchange_seq_forall_unpacking_end - + // _halo_exchange_seq_forall_unpacking_end } timer.stop(); @@ -459,136 +490,139 @@ int main(int argc, char **argv) timer.reset(); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate(buffers[l]); - } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // check results against reference copy checkResult(vars, vars_ref, var_size, num_vars); - //printResult(vars, var_size, num_vars); + // printResult(vars, var_size, num_vars); } -//----------------------------------------------------------------------------// -// RAJA::WorkGroup with allows deferred execution -// This has overhead and indirection not in the separate loop version, -// but can be useful for debugging. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::WorkGroup with allows deferred execution + // This has overhead and indirection not in the separate loop version, + // but can be useful for debugging. + //----------------------------------------------------------------------------// { - std::cout << "\n Running RAJA loop workgroup halo exchange...\n"; + std::cout << "\n Running RAJA loop workgroup halo exchange...\n"; double minCycle = std::numeric_limits::max(); // _halo_exchange_seq_workgroup_policies_start using forall_policy = RAJA::seq_exec; - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::seq_work, - RAJA::ordered, - RAJA::ragged_array_of_objects, - RAJA::indirect_function_call_dispatch >; - - using workpool = RAJA::WorkPool< workgroup_policy, - int, - RAJA::xargs<>, - memory_manager_allocator >; - - using workgroup = RAJA::WorkGroup< workgroup_policy, - int, - RAJA::xargs<>, - memory_manager_allocator >; - - using worksite = RAJA::WorkSite< workgroup_policy, - int, - RAJA::xargs<>, - memory_manager_allocator >; + using workgroup_policy = + RAJA::WorkGroupPolicy; + + using workpool = RAJA::WorkPool, + memory_manager_allocator>; + + using workgroup = RAJA::WorkGroup, + memory_manager_allocator>; + + using worksite = RAJA::WorkSite, + memory_manager_allocator>; // _halo_exchange_seq_workgroup_policies_end std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate(buffer_len); - } - workpool pool_pack (memory_manager_allocator{}); + workpool pool_pack(memory_manager_allocator{}); workpool pool_unpack(memory_manager_allocator{}); - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + 
double* var = vars[v]; - RAJA::forall(range_segment(0, var_size), [=] (int i) { - var[i] = i + v; - }); - } + RAJA::forall(range_segment(0, var_size), + [=](int i) { var[i] = i + v; }); + } - // _halo_exchange_seq_workgroup_packing_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_seq_workgroup_packing_start + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - pool_pack.enqueue(range_segment(0, len), [=] (int i) { - buffer[i] = var[list[i]]; - }); + pool_pack.enqueue(range_segment(0, len), + [=](int i) { buffer[i] = var[list[i]]; }); - buffer += len; + buffer += len; + } } - } - workgroup group_pack = pool_pack.instantiate(); + workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(); + worksite site_pack = group_pack.run(); - // send all messages - // _halo_exchange_seq_workgroup_packing_end + // send all messages + // _halo_exchange_seq_workgroup_packing_end - // _halo_exchange_seq_workgroup_unpacking_start - // recv all messages + // _halo_exchange_seq_workgroup_unpacking_start + // recv all messages - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = unpack_index_lists[l]; - int len = unpack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - // unpack - for (int v = 0; v < num_vars; ++v) { + // unpack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - pool_unpack.enqueue(range_segment(0, len), [=] (int i) { - var[list[i]] = buffer[i]; - }); + pool_unpack.enqueue(range_segment(0, len), + [=](int i) { var[list[i]] = buffer[i]; }); - buffer += len; + buffer += len; + } } - } - workgroup group_unpack = pool_unpack.instantiate(); - - worksite site_unpack = group_unpack.run(); - // _halo_exchange_seq_workgroup_unpacking_end + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(); + // _halo_exchange_seq_workgroup_unpacking_end } timer.stop(); @@ -597,28 +631,29 @@ int main(int argc, char **argv) timer.reset(); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate(buffers[l]); - } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // check results against reference copy checkResult(vars, vars_ref, var_size, num_vars); - //printResult(vars, var_size, num_vars); + // printResult(vars, var_size, num_vars); } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// Separate packing/unpacking loops using forall -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Separate packing/unpacking loops using forall + 
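In the OpenMP forall variant that follows, each packing and unpacking loop is dispatched with omp_parallel_for_exec, so per cycle the packing phase alone issues num_neighbors * num_vars = 26 * 3 = 78 separate parallel loops over fairly short index lists with the default inputs, and the unpacking phase issues another 78. The workgroup variant further below exists to amortize exactly that per-loop overhead.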
//----------------------------------------------------------------------------// { std::cout << "\n Running RAJA Openmp forall halo exchange...\n"; @@ -630,74 +665,78 @@ int main(int argc, char **argv) std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate(buffer_len); - } - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, var_size), [=] (int i) { - var[i] = i + v; - }); - } + RAJA::forall(range_segment(0, var_size), + [=](int i) { var[i] = i + v; }); + } - // _halo_exchange_openmp_forall_packing_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_openmp_forall_packing_start + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, len), [=] (int i) { - buffer[i] = var[list[i]]; - }); + RAJA::forall(range_segment(0, len), [=](int i) { + buffer[i] = var[list[i]]; + }); - buffer += len; - } + buffer += len; + } - // send single message - } - // _halo_exchange_openmp_forall_packing_end + // send single message + } + // _halo_exchange_openmp_forall_packing_end - // _halo_exchange_openmp_forall_unpacking_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_openmp_forall_unpacking_start + for (int l = 0; l < num_neighbors; ++l) + { - // recv single message + // recv single message - double* buffer = buffers[l]; - int* list = unpack_index_lists[l]; - int len = unpack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - // unpack - for (int v = 0; v < num_vars; ++v) { + // unpack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, len), [=] (int i) { - var[list[i]] = buffer[i]; - }); + RAJA::forall(range_segment(0, len), [=](int i) { + var[list[i]] = buffer[i]; + }); - buffer += len; + buffer += len; + } } - } - // _halo_exchange_openmp_forall_unpacking_end - + // _halo_exchange_openmp_forall_unpacking_end } timer.stop(); @@ -706,23 +745,24 @@ int main(int argc, char **argv) timer.reset(); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate(buffers[l]); - } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // check results against reference copy checkResult(vars, vars_ref, var_size, num_vars); - //printResult(vars, var_size, num_vars); + // printResult(vars, var_size, num_vars); } -//----------------------------------------------------------------------------// -// RAJA::WorkGroup may allow effective parallelism across loops with Openmp. 
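The workgroup variant below addresses this by deferring work: loop bodies are enqueued on a WorkPool, instantiate() captures them into a WorkGroup, and a single run() executes them all and returns a WorkSite. A minimal sketch of that lifecycle, using the workpool/workgroup/worksite and range_segment aliases defined in these variants and assuming the allocator is instantiated for char; buffer, var, list, and len stand for the packing data as in the loops above.

  // Minimal sketch of the WorkPool -> WorkGroup -> WorkSite lifecycle.
  workpool pool(memory_manager_allocator<char>{});

  // Enqueue as many loop bodies as needed; nothing executes yet.
  pool.enqueue(range_segment(0, len),
               [=](int i) { buffer[i] = var[list[i]]; });

  // Capture the enqueued loops into a group, then execute them all at once.
  workgroup group = pool.instantiate();
  worksite site = group.run();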
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::WorkGroup may allow effective parallelism across loops with Openmp. + //----------------------------------------------------------------------------// { std::cout << "\n Running RAJA OpenMP workgroup halo exchange...\n"; @@ -731,109 +771,111 @@ int main(int argc, char **argv) // _halo_exchange_openmp_workgroup_policies_start using forall_policy = RAJA::omp_parallel_for_exec; - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::omp_work, - RAJA::ordered, - RAJA::ragged_array_of_objects, - RAJA::indirect_function_call_dispatch >; - - using workpool = RAJA::WorkPool< workgroup_policy, - int, - RAJA::xargs<>, - memory_manager_allocator >; - - using workgroup = RAJA::WorkGroup< workgroup_policy, - int, - RAJA::xargs<>, - memory_manager_allocator >; - - using worksite = RAJA::WorkSite< workgroup_policy, - int, - RAJA::xargs<>, - memory_manager_allocator >; + using workgroup_policy = + RAJA::WorkGroupPolicy; + + using workpool = RAJA::WorkPool, + memory_manager_allocator>; + + using workgroup = RAJA::WorkGroup, + memory_manager_allocator>; + + using worksite = RAJA::WorkSite, + memory_manager_allocator>; // _halo_exchange_openmp_workgroup_policies_end std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate(buffer_len); - } - workpool pool_pack (memory_manager_allocator{}); + workpool pool_pack(memory_manager_allocator{}); workpool pool_unpack(memory_manager_allocator{}); - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, var_size), [=] (int i) { - var[i] = i + v; - }); - } + RAJA::forall(range_segment(0, var_size), + [=](int i) { var[i] = i + v; }); + } - // _halo_exchange_openmp_workgroup_packing_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_openmp_workgroup_packing_start + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - pool_pack.enqueue(range_segment(0, len), [=] (int i) { - buffer[i] = var[list[i]]; - }); + pool_pack.enqueue(range_segment(0, len), + [=](int i) { buffer[i] = var[list[i]]; }); - buffer += len; + buffer += len; + } } - } - workgroup group_pack = pool_pack.instantiate(); + workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(); + worksite site_pack = group_pack.run(); - // send all messages - // _halo_exchange_openmp_workgroup_packing_end + // send all messages + // _halo_exchange_openmp_workgroup_packing_end - // _halo_exchange_openmp_workgroup_unpacking_start - // recv all messages + // _halo_exchange_openmp_workgroup_unpacking_start + // recv all messages - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* 
list = unpack_index_lists[l]; - int len = unpack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - // unpack - for (int v = 0; v < num_vars; ++v) { + // unpack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - pool_unpack.enqueue(range_segment(0, len), [=] (int i) { - var[list[i]] = buffer[i]; - }); + pool_unpack.enqueue(range_segment(0, len), + [=](int i) { var[list[i]] = buffer[i]; }); - buffer += len; + buffer += len; + } } - } - - workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(); - // _halo_exchange_openmp_workgroup_unpacking_end + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(); + // _halo_exchange_openmp_workgroup_unpacking_end } timer.stop(); @@ -842,30 +884,31 @@ int main(int argc, char **argv) timer.reset(); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate(buffers[l]); - } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // check results against reference copy checkResult(vars, vars_ref, var_size, num_vars); - //printResult(vars, var_size, num_vars); + // printResult(vars, var_size, num_vars); } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// Separate packing/unpacking loops using forall -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Separate packing/unpacking loops using forall + //----------------------------------------------------------------------------// { std::cout << "\n Running RAJA Cuda forall halo exchange...\n"; @@ -873,25 +916,33 @@ int main(int argc, char **argv) std::vector cuda_vars(num_vars, nullptr); - std::vector cuda_pack_index_lists(num_neighbors, nullptr); - std::vector cuda_unpack_index_lists(num_neighbors, nullptr); + std::vector cuda_pack_index_lists(num_neighbors, nullptr); + std::vector cuda_unpack_index_lists(num_neighbors, nullptr); - for (int v = 0; v < num_vars; ++v) { + for (int v = 0; v < num_vars; ++v) + { cuda_vars[v] = memoryManager::allocate_gpu(var_size); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int pack_len = pack_index_list_lengths[l]; cuda_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); - cudaErrchk(cudaMemcpy( cuda_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), cudaMemcpyDefault )); + cudaErrchk(cudaMemcpy(cuda_pack_index_lists[l], + pack_index_lists[l], + pack_len * sizeof(int), + cudaMemcpyDefault)); int unpack_len = unpack_index_list_lengths[l]; cuda_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); - cudaErrchk(cudaMemcpy( cuda_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), cudaMemcpyDefault )); + cudaErrchk(cudaMemcpy(cuda_unpack_index_lists[l], + unpack_index_lists[l], + unpack_len * sizeof(int), + cudaMemcpyDefault)); } - std::swap(vars, cuda_vars); - std::swap(pack_index_lists, cuda_pack_index_lists); + std::swap(vars, cuda_vars); + 
std::swap(pack_index_lists, cuda_pack_index_lists); std::swap(unpack_index_lists, cuda_unpack_index_lists); @@ -901,78 +952,83 @@ int main(int argc, char **argv) std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate_gpu(buffer_len); - } - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { - var[i] = i + v; - }); - } + RAJA::forall( + range_segment(0, var_size), + [=] RAJA_DEVICE(int i) { var[i] = i + v; }); + } - // _halo_exchange_cuda_forall_packing_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_cuda_forall_packing_start + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, len), [=] RAJA_DEVICE (int i) { - buffer[i] = var[list[i]]; - }); + RAJA::forall( + range_segment(0, len), + [=] RAJA_DEVICE(int i) { buffer[i] = var[list[i]]; }); - buffer += len; - } + buffer += len; + } - cudaErrchk(cudaDeviceSynchronize()); + cudaErrchk(cudaDeviceSynchronize()); - // send single message - } - // _halo_exchange_cuda_forall_packing_end + // send single message + } + // _halo_exchange_cuda_forall_packing_end - // _halo_exchange_cuda_forall_unpacking_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_cuda_forall_unpacking_start + for (int l = 0; l < num_neighbors; ++l) + { - // recv single message + // recv single message - double* buffer = buffers[l]; - int* list = unpack_index_lists[l]; - int len = unpack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - // unpack - for (int v = 0; v < num_vars; ++v) { + // unpack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, len), [=] RAJA_DEVICE (int i) { - var[list[i]] = buffer[i]; - }); + RAJA::forall( + range_segment(0, len), + [=] RAJA_DEVICE(int i) { var[list[i]] = buffer[i]; }); - buffer += len; + buffer += len; + } } - } - - cudaErrchk(cudaDeviceSynchronize()); - // _halo_exchange_cuda_forall_unpacking_end + cudaErrchk(cudaDeviceSynchronize()); + // _halo_exchange_cuda_forall_unpacking_end } timer.stop(); @@ -981,39 +1037,43 @@ int main(int argc, char **argv) timer.reset(); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(buffers[l]); - } - std::swap(vars, cuda_vars); - std::swap(pack_index_lists, cuda_pack_index_lists); + std::swap(vars, cuda_vars); + std::swap(pack_index_lists, cuda_pack_index_lists); std::swap(unpack_index_lists, cuda_unpack_index_lists); - for (int v = 0; v < num_vars; ++v) { - cudaErrchk(cudaMemcpy( vars[v], cuda_vars[v], var_size * sizeof(double), cudaMemcpyDefault )); + for (int v = 0; v < num_vars; ++v) + { + cudaErrchk(cudaMemcpy( + vars[v], 
cuda_vars[v], var_size * sizeof(double), cudaMemcpyDefault)); memoryManager::deallocate_gpu(cuda_vars[v]); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(cuda_pack_index_lists[l]); memoryManager::deallocate_gpu(cuda_unpack_index_lists[l]); } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // check results against reference copy checkResult(vars, vars_ref, var_size, num_vars); - //printResult(vars, var_size, num_vars); + // printResult(vars, var_size, num_vars); } -//----------------------------------------------------------------------------// -// RAJA::WorkGroup with cuda_work allows deferred kernel fusion execution -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::WorkGroup with cuda_work allows deferred kernel fusion execution + //----------------------------------------------------------------------------// { std::cout << "\n Running RAJA Cuda workgroup halo exchange...\n"; @@ -1021,138 +1081,145 @@ int main(int argc, char **argv) std::vector cuda_vars(num_vars, nullptr); - std::vector cuda_pack_index_lists(num_neighbors, nullptr); - std::vector cuda_unpack_index_lists(num_neighbors, nullptr); + std::vector cuda_pack_index_lists(num_neighbors, nullptr); + std::vector cuda_unpack_index_lists(num_neighbors, nullptr); - for (int v = 0; v < num_vars; ++v) { + for (int v = 0; v < num_vars; ++v) + { cuda_vars[v] = memoryManager::allocate_gpu(var_size); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int pack_len = pack_index_list_lengths[l]; cuda_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); - cudaErrchk(cudaMemcpy( cuda_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), cudaMemcpyDefault )); + cudaErrchk(cudaMemcpy(cuda_pack_index_lists[l], + pack_index_lists[l], + pack_len * sizeof(int), + cudaMemcpyDefault)); int unpack_len = unpack_index_list_lengths[l]; cuda_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); - cudaErrchk(cudaMemcpy( cuda_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), cudaMemcpyDefault )); + cudaErrchk(cudaMemcpy(cuda_unpack_index_lists[l], + unpack_index_lists[l], + unpack_len * sizeof(int), + cudaMemcpyDefault)); } - std::swap(vars, cuda_vars); - std::swap(pack_index_lists, cuda_pack_index_lists); + std::swap(vars, cuda_vars); + std::swap(pack_index_lists, cuda_pack_index_lists); std::swap(unpack_index_lists, cuda_unpack_index_lists); // _halo_exchange_cuda_workgroup_policies_start using forall_policy = RAJA::cuda_exec_async; - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::cuda_work_async, - RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, - RAJA::constant_stride_array_of_objects, - RAJA::indirect_function_call_dispatch >; - - using workpool = RAJA::WorkPool< workgroup_policy, - int, - RAJA::xargs<>, - pinned_allocator >; - - using workgroup = RAJA::WorkGroup< workgroup_policy, - int, - RAJA::xargs<>, - pinned_allocator >; - - using worksite = RAJA::WorkSite< workgroup_policy, - int, - RAJA::xargs<>, - pinned_allocator >; + using workgroup_policy = RAJA::WorkGroupPolicy< + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + 
RAJA::indirect_function_call_dispatch>; + + using workpool = RAJA:: + WorkPool, pinned_allocator>; + + using workgroup = RAJA:: + WorkGroup, pinned_allocator>; + + using worksite = RAJA:: + WorkSite, pinned_allocator>; // _halo_exchange_cuda_workgroup_policies_end std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate_gpu(buffer_len); - } - workpool pool_pack (pinned_allocator{}); + workpool pool_pack(pinned_allocator{}); workpool pool_unpack(pinned_allocator{}); - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { - var[i] = i + v; - }); - } + RAJA::forall( + range_segment(0, var_size), + [=] RAJA_DEVICE(int i) { var[i] = i + v; }); + } - // _halo_exchange_cuda_workgroup_packing_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_cuda_workgroup_packing_start + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - pool_pack.enqueue(range_segment(0, len), [=] RAJA_DEVICE (int i) { - buffer[i] = var[list[i]]; - }); + pool_pack.enqueue(range_segment(0, len), [=] RAJA_DEVICE(int i) { + buffer[i] = var[list[i]]; + }); - buffer += len; + buffer += len; + } } - } - workgroup group_pack = pool_pack.instantiate(); + workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(); + worksite site_pack = group_pack.run(); - cudaErrchk(cudaDeviceSynchronize()); + cudaErrchk(cudaDeviceSynchronize()); - // send all messages - // _halo_exchange_cuda_workgroup_packing_end + // send all messages + // _halo_exchange_cuda_workgroup_packing_end - // _halo_exchange_cuda_workgroup_unpacking_start - // recv all messages + // _halo_exchange_cuda_workgroup_unpacking_start + // recv all messages - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = unpack_index_lists[l]; - int len = unpack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - // unpack - for (int v = 0; v < num_vars; ++v) { + // unpack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - pool_unpack.enqueue(range_segment(0, len), [=] RAJA_DEVICE (int i) { - var[list[i]] = buffer[i]; - }); + pool_unpack.enqueue(range_segment(0, len), [=] RAJA_DEVICE(int i) { + var[list[i]] = buffer[i]; + }); - buffer += len; + buffer += len; + } } - } - - workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(); + workgroup group_unpack = pool_unpack.instantiate(); - cudaErrchk(cudaDeviceSynchronize()); - // _halo_exchange_cuda_workgroup_unpacking_end + worksite site_unpack = group_unpack.run(); + cudaErrchk(cudaDeviceSynchronize()); + // _halo_exchange_cuda_workgroup_unpacking_end } timer.stop(); @@ 
-1161,46 +1228,50 @@ int main(int argc, char **argv) timer.reset(); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(buffers[l]); - } - std::swap(vars, cuda_vars); - std::swap(pack_index_lists, cuda_pack_index_lists); + std::swap(vars, cuda_vars); + std::swap(pack_index_lists, cuda_pack_index_lists); std::swap(unpack_index_lists, cuda_unpack_index_lists); - for (int v = 0; v < num_vars; ++v) { - cudaErrchk(cudaMemcpy( vars[v], cuda_vars[v], var_size * sizeof(double), cudaMemcpyDefault )); + for (int v = 0; v < num_vars; ++v) + { + cudaErrchk(cudaMemcpy( + vars[v], cuda_vars[v], var_size * sizeof(double), cudaMemcpyDefault)); memoryManager::deallocate_gpu(cuda_vars[v]); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(cuda_pack_index_lists[l]); memoryManager::deallocate_gpu(cuda_unpack_index_lists[l]); } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // check results against reference copy checkResult(vars, vars_ref, var_size, num_vars); - //printResult(vars, var_size, num_vars); + // printResult(vars, var_size, num_vars); } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// Separate packing/unpacking loops using forall -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Separate packing/unpacking loops using forall + //----------------------------------------------------------------------------// { std::cout << "\n Running RAJA Hip forall halo exchange...\n"; @@ -1208,25 +1279,33 @@ int main(int argc, char **argv) std::vector hip_vars(num_vars, nullptr); - std::vector hip_pack_index_lists(num_neighbors, nullptr); - std::vector hip_unpack_index_lists(num_neighbors, nullptr); + std::vector hip_pack_index_lists(num_neighbors, nullptr); + std::vector hip_unpack_index_lists(num_neighbors, nullptr); - for (int v = 0; v < num_vars; ++v) { + for (int v = 0; v < num_vars; ++v) + { hip_vars[v] = memoryManager::allocate_gpu(var_size); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int pack_len = pack_index_list_lengths[l]; hip_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); - hipErrchk(hipMemcpy( hip_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(hip_pack_index_lists[l], + pack_index_lists[l], + pack_len * sizeof(int), + hipMemcpyHostToDevice)); int unpack_len = unpack_index_list_lengths[l]; hip_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); - hipErrchk(hipMemcpy( hip_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(hip_unpack_index_lists[l], + unpack_index_lists[l], + unpack_len * sizeof(int), + hipMemcpyHostToDevice)); } - std::swap(vars, hip_vars); - std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); std::swap(unpack_index_lists, hip_unpack_index_lists); @@ -1236,78 +1315,83 @@ int main(int argc, char 
**argv) std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate_gpu(buffer_len); - } - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { - var[i] = i + v; - }); - } + RAJA::forall( + range_segment(0, var_size), + [=] RAJA_DEVICE(int i) { var[i] = i + v; }); + } - // _halo_exchange_hip_forall_packing_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_hip_forall_packing_start + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, len), [=] RAJA_DEVICE (int i) { - buffer[i] = var[list[i]]; - }); + RAJA::forall( + range_segment(0, len), + [=] RAJA_DEVICE(int i) { buffer[i] = var[list[i]]; }); - buffer += len; - } + buffer += len; + } - hipErrchk(hipDeviceSynchronize()); + hipErrchk(hipDeviceSynchronize()); - // send single message - } - // _halo_exchange_hip_forall_packing_end + // send single message + } + // _halo_exchange_hip_forall_packing_end - // _halo_exchange_hip_forall_unpacking_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_hip_forall_unpacking_start + for (int l = 0; l < num_neighbors; ++l) + { - // recv single message + // recv single message - double* buffer = buffers[l]; - int* list = unpack_index_lists[l]; - int len = unpack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - // unpack - for (int v = 0; v < num_vars; ++v) { + // unpack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, len), [=] RAJA_DEVICE (int i) { - var[list[i]] = buffer[i]; - }); + RAJA::forall( + range_segment(0, len), + [=] RAJA_DEVICE(int i) { var[list[i]] = buffer[i]; }); - buffer += len; + buffer += len; + } } - } - - hipErrchk(hipDeviceSynchronize()); - // _halo_exchange_hip_forall_unpacking_end + hipErrchk(hipDeviceSynchronize()); + // _halo_exchange_hip_forall_unpacking_end } timer.stop(); @@ -1316,179 +1400,193 @@ int main(int argc, char **argv) timer.reset(); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(buffers[l]); - } - std::swap(vars, hip_vars); - std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); std::swap(unpack_index_lists, hip_unpack_index_lists); - for (int v = 0; v < num_vars; ++v) { - hipErrchk(hipMemcpy( vars[v], hip_vars[v], var_size * sizeof(double), hipMemcpyDeviceToHost )); + for (int v = 0; v < num_vars; ++v) + { + hipErrchk(hipMemcpy(vars[v], + hip_vars[v], + var_size * sizeof(double), + hipMemcpyDeviceToHost)); memoryManager::deallocate_gpu(hip_vars[v]); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 
0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(hip_pack_index_lists[l]); memoryManager::deallocate_gpu(hip_unpack_index_lists[l]); } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // check results against reference copy checkResult(vars, vars_ref, var_size, num_vars); - //printResult(vars, var_size, num_vars); + // printResult(vars, var_size, num_vars); } #if defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) -//----------------------------------------------------------------------------// -// RAJA::WorkGroup with hip_work allows deferred kernel fusion execution -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::WorkGroup with hip_work allows deferred kernel fusion execution + //----------------------------------------------------------------------------// { - std::cout << "\n Running RAJA Hip indirect dispatch workgroup halo exchange...\n"; + std::cout << "\n Running RAJA Hip indirect dispatch workgroup halo " + "exchange...\n"; double minCycle = std::numeric_limits::max(); std::vector hip_vars(num_vars, nullptr); - std::vector hip_pack_index_lists(num_neighbors, nullptr); - std::vector hip_unpack_index_lists(num_neighbors, nullptr); + std::vector hip_pack_index_lists(num_neighbors, nullptr); + std::vector hip_unpack_index_lists(num_neighbors, nullptr); - for (int v = 0; v < num_vars; ++v) { + for (int v = 0; v < num_vars; ++v) + { hip_vars[v] = memoryManager::allocate_gpu(var_size); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int pack_len = pack_index_list_lengths[l]; hip_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); - hipErrchk(hipMemcpy( hip_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(hip_pack_index_lists[l], + pack_index_lists[l], + pack_len * sizeof(int), + hipMemcpyHostToDevice)); int unpack_len = unpack_index_list_lengths[l]; hip_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); - hipErrchk(hipMemcpy( hip_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(hip_unpack_index_lists[l], + unpack_index_lists[l], + unpack_len * sizeof(int), + hipMemcpyHostToDevice)); } - std::swap(vars, hip_vars); - std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); std::swap(unpack_index_lists, hip_unpack_index_lists); // _halo_exchange_hip_workgroup_policies_start using forall_policy = RAJA::hip_exec_async; - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::hip_work_async, - RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, - RAJA::constant_stride_array_of_objects, - RAJA::indirect_function_call_dispatch >; - - using workpool = RAJA::WorkPool< workgroup_policy, - int, - RAJA::xargs<>, - pinned_allocator >; - - using workgroup = RAJA::WorkGroup< workgroup_policy, - int, - RAJA::xargs<>, - pinned_allocator >; - - using worksite = RAJA::WorkSite< workgroup_policy, - int, - RAJA::xargs<>, - pinned_allocator >; + using workgroup_policy = RAJA::WorkGroupPolicy< + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::indirect_function_call_dispatch>; + + using workpool = 
RAJA:: + WorkPool, pinned_allocator>; + + using workgroup = RAJA:: + WorkGroup, pinned_allocator>; + + using worksite = RAJA:: + WorkSite, pinned_allocator>; // _halo_exchange_hip_workgroup_policies_end std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate_gpu(buffer_len); - } - workpool pool_pack (pinned_allocator{}); + workpool pool_pack(pinned_allocator{}); workpool pool_unpack(pinned_allocator{}); - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { - var[i] = i + v; - }); - } + RAJA::forall( + range_segment(0, var_size), + [=] RAJA_DEVICE(int i) { var[i] = i + v; }); + } - // _halo_exchange_hip_workgroup_packing_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_hip_workgroup_packing_start + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - pool_pack.enqueue(range_segment(0, len), [=] RAJA_DEVICE (int i) { - buffer[i] = var[list[i]]; - }); + pool_pack.enqueue(range_segment(0, len), [=] RAJA_DEVICE(int i) { + buffer[i] = var[list[i]]; + }); - buffer += len; + buffer += len; + } } - } - workgroup group_pack = pool_pack.instantiate(); + workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(); + worksite site_pack = group_pack.run(); - hipErrchk(hipDeviceSynchronize()); + hipErrchk(hipDeviceSynchronize()); - // send all messages - // _halo_exchange_hip_workgroup_packing_end + // send all messages + // _halo_exchange_hip_workgroup_packing_end - // _halo_exchange_hip_workgroup_unpacking_start - // recv all messages + // _halo_exchange_hip_workgroup_unpacking_start + // recv all messages - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = unpack_index_lists[l]; - int len = unpack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - // unpack - for (int v = 0; v < num_vars; ++v) { + // unpack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - pool_unpack.enqueue(range_segment(0, len), [=] RAJA_DEVICE (int i) { - var[list[i]] = buffer[i]; - }); + pool_unpack.enqueue(range_segment(0, len), [=] RAJA_DEVICE(int i) { + var[list[i]] = buffer[i]; + }); - buffer += len; + buffer += len; + } } - } - workgroup group_unpack = pool_unpack.instantiate(); + workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(); - - hipErrchk(hipDeviceSynchronize()); - // _halo_exchange_hip_workgroup_unpacking_end + worksite site_unpack = group_unpack.run(); + hipErrchk(hipDeviceSynchronize()); + // _halo_exchange_hip_workgroup_unpacking_end } timer.stop(); @@ -1497,188 +1595,200 @@ int main(int argc, char **argv) timer.reset(); } - for 
(int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(buffers[l]); - } - std::swap(vars, hip_vars); - std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); std::swap(unpack_index_lists, hip_unpack_index_lists); - for (int v = 0; v < num_vars; ++v) { - hipErrchk(hipMemcpy( vars[v], hip_vars[v], var_size * sizeof(double), hipMemcpyDeviceToHost )); + for (int v = 0; v < num_vars; ++v) + { + hipErrchk(hipMemcpy(vars[v], + hip_vars[v], + var_size * sizeof(double), + hipMemcpyDeviceToHost)); memoryManager::deallocate_gpu(hip_vars[v]); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(hip_pack_index_lists[l]); memoryManager::deallocate_gpu(hip_unpack_index_lists[l]); } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // check results against reference copy checkResult(vars, vars_ref, var_size, num_vars); - //printResult(vars, var_size, num_vars); + // printResult(vars, var_size, num_vars); } #endif -//----------------------------------------------------------------------------// -// RAJA::WorkGroup with hip_work allows deferred kernel fusion execution -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::WorkGroup with hip_work allows deferred kernel fusion execution + //----------------------------------------------------------------------------// { - std::cout << "\n Running RAJA Hip direct dispatch workgroup halo exchange...\n"; + std::cout << "\n Running RAJA Hip direct dispatch workgroup halo " + "exchange...\n"; double minCycle = std::numeric_limits::max(); std::vector hip_vars(num_vars, nullptr); - std::vector hip_pack_index_lists(num_neighbors, nullptr); - std::vector hip_unpack_index_lists(num_neighbors, nullptr); + std::vector hip_pack_index_lists(num_neighbors, nullptr); + std::vector hip_unpack_index_lists(num_neighbors, nullptr); - for (int v = 0; v < num_vars; ++v) { + for (int v = 0; v < num_vars; ++v) + { hip_vars[v] = memoryManager::allocate_gpu(var_size); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int pack_len = pack_index_list_lengths[l]; hip_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); - hipErrchk(hipMemcpy( hip_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(hip_pack_index_lists[l], + pack_index_lists[l], + pack_len * sizeof(int), + hipMemcpyHostToDevice)); int unpack_len = unpack_index_list_lengths[l]; hip_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); - hipErrchk(hipMemcpy( hip_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(hip_unpack_index_lists[l], + unpack_index_lists[l], + unpack_len * sizeof(int), + hipMemcpyHostToDevice)); } - std::swap(vars, hip_vars); - std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); std::swap(unpack_index_lists, hip_unpack_index_lists); using forall_policy = RAJA::hip_exec_async; - struct Packer { + struct Packer + { double* buffer; double* var; int* list; - RAJA_DEVICE void operator() (int i) const { - 
buffer[i] = var[list[i]]; - } + RAJA_DEVICE void operator()(int i) const { buffer[i] = var[list[i]]; } }; - struct UnPacker { + struct UnPacker + { double* buffer; double* var; int* list; - RAJA_DEVICE void operator()(int i) const { - var[list[i]] = buffer[i]; - } + RAJA_DEVICE void operator()(int i) const { var[list[i]] = buffer[i]; } }; - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::hip_work_async, - RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, - RAJA::constant_stride_array_of_objects, - RAJA::direct_dispatch, - camp::list> - >; - - using workpool = RAJA::WorkPool< workgroup_policy, - int, - RAJA::xargs<>, - pinned_allocator >; - - using workgroup = RAJA::WorkGroup< workgroup_policy, - int, - RAJA::xargs<>, - pinned_allocator >; - - using worksite = RAJA::WorkSite< workgroup_policy, - int, - RAJA::xargs<>, - pinned_allocator >; + using workgroup_policy = RAJA::WorkGroupPolicy< + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::direct_dispatch, + camp::list>>; + + using workpool = RAJA:: + WorkPool, pinned_allocator>; + + using workgroup = RAJA:: + WorkGroup, pinned_allocator>; + + using worksite = RAJA:: + WorkSite, pinned_allocator>; std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate_gpu(buffer_len); - } - workpool pool_pack (pinned_allocator{}); + workpool pool_pack(pinned_allocator{}); workpool pool_unpack(pinned_allocator{}); - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { - var[i] = i + v; - }); - } + RAJA::forall( + range_segment(0, var_size), + [=] RAJA_DEVICE(int i) { var[i] = i + v; }); + } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); - buffer += len; + buffer += len; + } } - } - workgroup group_pack = pool_pack.instantiate(); + workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(); + worksite site_pack = group_pack.run(); - hipErrchk(hipDeviceSynchronize()); + hipErrchk(hipDeviceSynchronize()); - // send all messages + // send all messages - // recv all messages + // recv all messages - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = unpack_index_lists[l]; - int len = unpack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - // unpack - for (int v = 0; v < num_vars; ++v) { + // unpack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - 
pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); + pool_unpack.enqueue(range_segment(0, len), + UnPacker{buffer, var, list}); - buffer += len; + buffer += len; + } } - } - workgroup group_unpack = pool_unpack.instantiate(); + workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(); - - hipErrchk(hipDeviceSynchronize()); + worksite site_unpack = group_unpack.run(); + hipErrchk(hipDeviceSynchronize()); } timer.stop(); @@ -1687,45 +1797,52 @@ int main(int argc, char **argv) timer.reset(); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(buffers[l]); - } - std::swap(vars, hip_vars); - std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); std::swap(unpack_index_lists, hip_unpack_index_lists); - for (int v = 0; v < num_vars; ++v) { - hipErrchk(hipMemcpy( vars[v], hip_vars[v], var_size * sizeof(double), hipMemcpyDeviceToHost )); + for (int v = 0; v < num_vars; ++v) + { + hipErrchk(hipMemcpy(vars[v], + hip_vars[v], + var_size * sizeof(double), + hipMemcpyDeviceToHost)); memoryManager::deallocate_gpu(hip_vars[v]); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(hip_pack_index_lists[l]); memoryManager::deallocate_gpu(hip_unpack_index_lists[l]); } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // check results against reference copy checkResult(vars, vars_ref, var_size, num_vars); - //printResult(vars, var_size, num_vars); + // printResult(vars, var_size, num_vars); } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// - for (int v = 0; v < num_vars; ++v) { + // + // Clean up. + // + for (int v = 0; v < num_vars; ++v) + { memoryManager::deallocate(vars[v]); memoryManager::deallocate(vars_ref[v]); } @@ -1743,20 +1860,30 @@ int main(int argc, char **argv) // // Function to compare result to reference and report P/F. // -void checkResult(std::vector const& vars, std::vector const& vars_ref, - int var_size, int num_vars) +void checkResult(std::vector const& vars, + std::vector const& vars_ref, + int var_size, + int num_vars) { bool correct = true; - for (int v = 0; v < num_vars; ++v) { + for (int v = 0; v < num_vars; ++v) + { double* var = vars[v]; double* var_ref = vars_ref[v]; - for (int i = 0; i < var_size; i++) { - if ( var[i] != var_ref[i] ) { correct = false; } + for (int i = 0; i < var_size; i++) + { + if (var[i] != var_ref[i]) + { + correct = false; + } } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -1767,9 +1894,11 @@ void checkResult(std::vector const& vars, std::vector const& v void printResult(std::vector const& vars, int var_size, int num_vars) { std::cout << std::endl; - for (int v = 0; v < num_vars; ++v) { + for (int v = 0; v < num_vars; ++v) + { double* var = vars[v]; - for (int i = 0; i < var_size; i++) { + for (int i = 0; i < var_size; i++) + { std::cout << "result[" << i << "] = " << var[i] << std::endl; } } @@ -1791,119 +1920,202 @@ struct Extent // Function to generate index lists for packing. 
// void create_pack_lists(std::vector& pack_index_lists, - std::vector& pack_index_list_lengths, - const int halo_width, const int* grid_dims) + std::vector& pack_index_list_lengths, + const int halo_width, + const int* grid_dims) { std::vector pack_index_list_extents(num_neighbors); // faces - pack_index_list_extents[0] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[1] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[0] = Extent{halo_width, + halo_width + halo_width, + halo_width, + grid_dims[1] + halo_width, + halo_width, + grid_dims[2] + halo_width}; + pack_index_list_extents[1] = Extent{grid_dims[0], + grid_dims[0] + halo_width, + halo_width, + grid_dims[1] + halo_width, + halo_width, + grid_dims[2] + halo_width}; + pack_index_list_extents[2] = Extent{halo_width, + grid_dims[0] + halo_width, + halo_width, + halo_width + halo_width, + halo_width, + grid_dims[2] + halo_width}; + pack_index_list_extents[3] = Extent{halo_width, + grid_dims[0] + halo_width, + grid_dims[1], + grid_dims[1] + halo_width, + halo_width, + grid_dims[2] + halo_width}; + pack_index_list_extents[4] = Extent{halo_width, + grid_dims[0] + halo_width, + halo_width, + grid_dims[1] + halo_width, + halo_width, + halo_width + halo_width}; + pack_index_list_extents[5] = Extent{halo_width, + grid_dims[0] + halo_width, + halo_width, + grid_dims[1] + halo_width, + grid_dims[2], + grid_dims[2] + halo_width}; // edges - pack_index_list_extents[6] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[7] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[8] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[9] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[10] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[11] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[12] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[13] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], 
grid_dims[2] + halo_width}; - pack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[6] = Extent{halo_width, + halo_width + halo_width, + halo_width, + halo_width + halo_width, + halo_width, + grid_dims[2] + halo_width}; + pack_index_list_extents[7] = Extent{halo_width, + halo_width + halo_width, + grid_dims[1], + grid_dims[1] + halo_width, + halo_width, + grid_dims[2] + halo_width}; + pack_index_list_extents[8] = Extent{grid_dims[0], + grid_dims[0] + halo_width, + halo_width, + halo_width + halo_width, + halo_width, + grid_dims[2] + halo_width}; + pack_index_list_extents[9] = Extent{grid_dims[0], + grid_dims[0] + halo_width, + grid_dims[1], + grid_dims[1] + halo_width, + halo_width, + grid_dims[2] + halo_width}; + pack_index_list_extents[10] = Extent{halo_width, + halo_width + halo_width, + halo_width, + grid_dims[1] + halo_width, + halo_width, + halo_width + halo_width}; + pack_index_list_extents[11] = Extent{halo_width, + halo_width + halo_width, + halo_width, + grid_dims[1] + halo_width, + grid_dims[2], + grid_dims[2] + halo_width}; + pack_index_list_extents[12] = Extent{grid_dims[0], + grid_dims[0] + halo_width, + halo_width, + grid_dims[1] + halo_width, + halo_width, + halo_width + halo_width}; + pack_index_list_extents[13] = Extent{grid_dims[0], + grid_dims[0] + halo_width, + halo_width, + grid_dims[1] + halo_width, + grid_dims[2], + grid_dims[2] + halo_width}; + pack_index_list_extents[14] = Extent{halo_width, + grid_dims[0] + halo_width, + halo_width, + halo_width + halo_width, + halo_width, + halo_width + halo_width}; + pack_index_list_extents[15] = Extent{halo_width, + grid_dims[0] + halo_width, + halo_width, + halo_width + halo_width, + grid_dims[2], + grid_dims[2] + halo_width}; + pack_index_list_extents[16] = Extent{halo_width, + grid_dims[0] + halo_width, + grid_dims[1], + grid_dims[1] + halo_width, + halo_width, + halo_width + halo_width}; + pack_index_list_extents[17] = Extent{halo_width, + grid_dims[0] + halo_width, + grid_dims[1], + grid_dims[1] + halo_width, + grid_dims[2], + grid_dims[2] + halo_width}; // corners - pack_index_list_extents[18] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[19] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[20] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[21] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[22] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[23] = Extent{grid_dims[0], 
grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[24] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[25] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[18] = Extent{halo_width, + halo_width + halo_width, + halo_width, + halo_width + halo_width, + halo_width, + halo_width + halo_width}; + pack_index_list_extents[19] = Extent{halo_width, + halo_width + halo_width, + halo_width, + halo_width + halo_width, + grid_dims[2], + grid_dims[2] + halo_width}; + pack_index_list_extents[20] = Extent{halo_width, + halo_width + halo_width, + grid_dims[1], + grid_dims[1] + halo_width, + halo_width, + halo_width + halo_width}; + pack_index_list_extents[21] = Extent{halo_width, + halo_width + halo_width, + grid_dims[1], + grid_dims[1] + halo_width, + grid_dims[2], + grid_dims[2] + halo_width}; + pack_index_list_extents[22] = Extent{grid_dims[0], + grid_dims[0] + halo_width, + halo_width, + halo_width + halo_width, + halo_width, + halo_width + halo_width}; + pack_index_list_extents[23] = Extent{grid_dims[0], + grid_dims[0] + halo_width, + halo_width, + halo_width + halo_width, + grid_dims[2], + grid_dims[2] + halo_width}; + pack_index_list_extents[24] = Extent{grid_dims[0], + grid_dims[0] + halo_width, + grid_dims[1], + grid_dims[1] + halo_width, + halo_width, + halo_width + halo_width}; + pack_index_list_extents[25] = Extent{grid_dims[0], + grid_dims[0] + halo_width, + grid_dims[1], + grid_dims[1] + halo_width, + grid_dims[2], + grid_dims[2] + halo_width}; const int grid_i_stride = 1; - const int grid_j_stride = grid_dims[0] + 2*halo_width; - const int grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); + const int grid_j_stride = grid_dims[0] + 2 * halo_width; + const int grid_k_stride = grid_j_stride * (grid_dims[1] + 2 * halo_width); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { Extent extent = pack_index_list_extents[l]; pack_index_list_lengths[l] = (extent.i_max - extent.i_min) * (extent.j_max - extent.j_min) * - (extent.k_max - extent.k_min) ; + (extent.k_max - extent.k_min); - pack_index_lists[l] = memoryManager::allocate(pack_index_list_lengths[l]); + pack_index_lists[l] = + memoryManager::allocate(pack_index_list_lengths[l]); int* pack_list = pack_index_lists[l]; int list_idx = 0; - for (int kk = extent.k_min; kk < extent.k_max; ++kk) { - for (int jj = extent.j_min; jj < extent.j_max; ++jj) { - for (int ii = extent.i_min; ii < extent.i_max; ++ii) { + for (int kk = extent.k_min; kk < extent.k_max; ++kk) + { + for (int jj = extent.j_min; jj < extent.j_max; ++jj) + { + for (int ii = extent.i_min; ii < extent.i_max; ++ii) + { - int pack_idx = ii * grid_i_stride + - jj * grid_j_stride + - kk * grid_k_stride ; + int pack_idx = + ii * grid_i_stride + jj * grid_j_stride + kk * grid_k_stride; pack_list[list_idx] = pack_idx; @@ -1919,7 +2131,8 @@ void create_pack_lists(std::vector& pack_index_lists, // void destroy_pack_lists(std::vector& pack_index_lists) { - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate(pack_index_lists[l]); } } @@ -1928,119 +2141,187 @@ void destroy_pack_lists(std::vector& pack_index_lists) // // Function to generate index lists for unpacking. 
// -void create_unpack_lists(std::vector& unpack_index_lists, std::vector& unpack_index_list_lengths, - const int halo_width, const int* grid_dims) +void create_unpack_lists(std::vector& unpack_index_lists, + std::vector& unpack_index_list_lengths, + const int halo_width, + const int* grid_dims) { std::vector unpack_index_list_extents(num_neighbors); // faces - unpack_index_list_extents[0] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[1] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[0] = Extent{0, + halo_width, + halo_width, + grid_dims[1] + halo_width, + halo_width, + grid_dims[2] + halo_width}; + unpack_index_list_extents[1] = Extent{grid_dims[0] + halo_width, + grid_dims[0] + 2 * halo_width, + halo_width, + grid_dims[1] + halo_width, + halo_width, + grid_dims[2] + halo_width}; + unpack_index_list_extents[2] = Extent{halo_width, + grid_dims[0] + halo_width, + 0, + halo_width, + halo_width, + grid_dims[2] + halo_width}; + unpack_index_list_extents[3] = Extent{halo_width, + grid_dims[0] + halo_width, + grid_dims[1] + halo_width, + grid_dims[1] + 2 * halo_width, + halo_width, + grid_dims[2] + halo_width}; + unpack_index_list_extents[4] = Extent{halo_width, + grid_dims[0] + halo_width, + halo_width, + grid_dims[1] + halo_width, + 0, + halo_width}; + unpack_index_list_extents[5] = Extent{halo_width, + grid_dims[0] + halo_width, + halo_width, + grid_dims[1] + halo_width, + grid_dims[2] + halo_width, + grid_dims[2] + 2 * halo_width}; // edges - unpack_index_list_extents[6] = Extent{0 , halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[7] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[8] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[9] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[10] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[11] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[12] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[13] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + 
halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[6] = Extent{ + 0, halo_width, 0, halo_width, halo_width, grid_dims[2] + halo_width}; + unpack_index_list_extents[7] = Extent{0, + halo_width, + grid_dims[1] + halo_width, + grid_dims[1] + 2 * halo_width, + halo_width, + grid_dims[2] + halo_width}; + unpack_index_list_extents[8] = Extent{grid_dims[0] + halo_width, + grid_dims[0] + 2 * halo_width, + 0, + halo_width, + halo_width, + grid_dims[2] + halo_width}; + unpack_index_list_extents[9] = Extent{grid_dims[0] + halo_width, + grid_dims[0] + 2 * halo_width, + grid_dims[1] + halo_width, + grid_dims[1] + 2 * halo_width, + halo_width, + grid_dims[2] + halo_width}; + unpack_index_list_extents[10] = Extent{ + 0, halo_width, halo_width, grid_dims[1] + halo_width, 0, halo_width}; + unpack_index_list_extents[11] = Extent{0, + halo_width, + halo_width, + grid_dims[1] + halo_width, + grid_dims[2] + halo_width, + grid_dims[2] + 2 * halo_width}; + unpack_index_list_extents[12] = Extent{grid_dims[0] + halo_width, + grid_dims[0] + 2 * halo_width, + halo_width, + grid_dims[1] + halo_width, + 0, + halo_width}; + unpack_index_list_extents[13] = Extent{grid_dims[0] + halo_width, + grid_dims[0] + 2 * halo_width, + halo_width, + grid_dims[1] + halo_width, + grid_dims[2] + halo_width, + grid_dims[2] + 2 * halo_width}; + unpack_index_list_extents[14] = Extent{ + halo_width, grid_dims[0] + halo_width, 0, halo_width, 0, halo_width}; + unpack_index_list_extents[15] = Extent{halo_width, + grid_dims[0] + halo_width, + 0, + halo_width, + grid_dims[2] + halo_width, + grid_dims[2] + 2 * halo_width}; + unpack_index_list_extents[16] = Extent{halo_width, + grid_dims[0] + halo_width, + grid_dims[1] + halo_width, + grid_dims[1] + 2 * halo_width, + 0, + halo_width}; + unpack_index_list_extents[17] = Extent{halo_width, + grid_dims[0] + halo_width, + grid_dims[1] + halo_width, + grid_dims[1] + 2 * halo_width, + grid_dims[2] + halo_width, + grid_dims[2] + 2 * halo_width}; // corners - unpack_index_list_extents[18] = Extent{0 , halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[19] = Extent{0 , halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[20] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[21] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[22] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[23] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[24] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 
2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[25] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[18] = + Extent{0, halo_width, 0, halo_width, 0, halo_width}; + unpack_index_list_extents[19] = Extent{0, + halo_width, + 0, + halo_width, + grid_dims[2] + halo_width, + grid_dims[2] + 2 * halo_width}; + unpack_index_list_extents[20] = Extent{0, + halo_width, + grid_dims[1] + halo_width, + grid_dims[1] + 2 * halo_width, + 0, + halo_width}; + unpack_index_list_extents[21] = Extent{0, + halo_width, + grid_dims[1] + halo_width, + grid_dims[1] + 2 * halo_width, + grid_dims[2] + halo_width, + grid_dims[2] + 2 * halo_width}; + unpack_index_list_extents[22] = Extent{grid_dims[0] + halo_width, + grid_dims[0] + 2 * halo_width, + 0, + halo_width, + 0, + halo_width}; + unpack_index_list_extents[23] = Extent{grid_dims[0] + halo_width, + grid_dims[0] + 2 * halo_width, + 0, + halo_width, + grid_dims[2] + halo_width, + grid_dims[2] + 2 * halo_width}; + unpack_index_list_extents[24] = Extent{grid_dims[0] + halo_width, + grid_dims[0] + 2 * halo_width, + grid_dims[1] + halo_width, + grid_dims[1] + 2 * halo_width, + 0, + halo_width}; + unpack_index_list_extents[25] = Extent{grid_dims[0] + halo_width, + grid_dims[0] + 2 * halo_width, + grid_dims[1] + halo_width, + grid_dims[1] + 2 * halo_width, + grid_dims[2] + halo_width, + grid_dims[2] + 2 * halo_width}; const int grid_i_stride = 1; - const int grid_j_stride = grid_dims[0] + 2*halo_width; - const int grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); + const int grid_j_stride = grid_dims[0] + 2 * halo_width; + const int grid_k_stride = grid_j_stride * (grid_dims[1] + 2 * halo_width); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { Extent extent = unpack_index_list_extents[l]; unpack_index_list_lengths[l] = (extent.i_max - extent.i_min) * (extent.j_max - extent.j_min) * - (extent.k_max - extent.k_min) ; + (extent.k_max - extent.k_min); - unpack_index_lists[l] = memoryManager::allocate(unpack_index_list_lengths[l]); + unpack_index_lists[l] = + memoryManager::allocate(unpack_index_list_lengths[l]); int* unpack_list = unpack_index_lists[l]; int list_idx = 0; - for (int kk = extent.k_min; kk < extent.k_max; ++kk) { - for (int jj = extent.j_min; jj < extent.j_max; ++jj) { - for (int ii = extent.i_min; ii < extent.i_max; ++ii) { + for (int kk = extent.k_min; kk < extent.k_max; ++kk) + { + for (int jj = extent.j_min; jj < extent.j_max; ++jj) + { + for (int ii = extent.i_min; ii < extent.i_max; ++ii) + { - int unpack_idx = ii * grid_i_stride + - jj * grid_j_stride + - kk * grid_k_stride ; + int unpack_idx = + ii * grid_i_stride + jj * grid_j_stride + kk * grid_k_stride; unpack_list[list_idx] = unpack_idx; @@ -2056,7 +2337,8 @@ void create_unpack_lists(std::vector& unpack_index_lists, std::vector // void destroy_unpack_lists(std::vector& unpack_index_lists) { - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate(unpack_index_lists[l]); } } diff --git a/examples/tut_launch_basic.cpp b/examples/tut_launch_basic.cpp index 96a2ffe2f0..fa354d2612 100644 --- a/examples/tut_launch_basic.cpp +++ b/examples/tut_launch_basic.cpp @@ -31,7 +31,7 @@ * the example below choses a sequential * execution space and either a CUDA or HIP * 
execution device execution space. -*/ + */ // __host_launch_start using host_launch = RAJA::seq_launch_t; @@ -45,12 +45,12 @@ using device_launch = RAJA::cuda_launch_t; using device_launch = RAJA::hip_launch_t; #endif -using launch_policy = RAJA::LaunchPolicy< - host_launch +using launch_policy = RAJA::LaunchPolicy; + >; /* * RAJA launch exposes a thread/block programming model @@ -64,69 +64,73 @@ using launch_policy = RAJA::LaunchPolicy< * On the host the loops expands to standard C style for loops. */ -using teams_x = RAJA::LoopPolicy< - RAJA::seq_exec +using teams_x = RAJA::LoopPolicy; + >; -using teams_y = RAJA::LoopPolicy< - RAJA::seq_exec +using teams_y = RAJA::LoopPolicy; + >; using threads_x = RAJA::LoopPolicy; + >; using threads_y = RAJA::LoopPolicy; + >; #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) __global__ void gpuKernel() { - //Equivalent CUDA/HIP style thread/block mapping - // _device_loop_start - {int by = blockIdx.y; - {int bx = blockIdx.x; - - {int ty = threadIdx.y; - {int tx = blockIdx.x; - - printf("device-iter: threadIdx_tx %d threadIdx_ty %d block_bx %d block_by %d \n", - tx, ty, bx, by); - + // Equivalent CUDA/HIP style thread/block mapping + // _device_loop_start + { + int by = blockIdx.y; + { + int bx = blockIdx.x; + + { + int ty = threadIdx.y; + { + int tx = blockIdx.x; + + printf("device-iter: threadIdx_tx %d threadIdx_ty %d block_bx %d " + "block_by %d \n", + tx, + ty, + bx, + by); } } - } } // _device_loop_end @@ -142,78 +146,102 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) - if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./tut_launch_basic host or ./tut_launch_basic device"); + if (argc != 2) + { + RAJA_ABORT_OR_THROW("Usage ./tut_launch_basic host or ./tut_launch_basic " + "device"); } -// -// Run time policy section is demonstrated in this example by specifying -// kernel exection space as a command line argument (host or device). -// Example usage ./tut_launch_basic host or ./tut_launch_basic device -// + // + // Run time policy section is demonstrated in this example by specifying + // kernel exection space as a command line argument (host or device). + // Example usage ./tut_launch_basic host or ./tut_launch_basic device + // std::string exec_space = argv[1]; - if(!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0 )){ - RAJA_ABORT_OR_THROW("Usage ./tut_launch_basic host or ./tut_launch_basic device"); + if (!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0)) + { + RAJA_ABORT_OR_THROW("Usage ./tut_launch_basic host or ./tut_launch_basic " + "device"); return 0; } RAJA::ExecPlace select_cpu_or_gpu; - if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA-Teams on the host \n"); } - if(exec_space.compare("device") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA-Teams on the device \n"); } + if (exec_space.compare("host") == 0) + { + select_cpu_or_gpu = RAJA::ExecPlace::HOST; + printf("Running RAJA-Teams on the host \n"); + } + if (exec_space.compare("device") == 0) + { + select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; + printf("Running RAJA-Teams on the device \n"); + } -// -// The following three kernels illustrate loop based parallelism -// based on nested for loops. For correctness team and thread loops -// make the assumption that all work inside can be done -// concurrently. 
-// + // + // The following three kernels illustrate loop based parallelism + // based on nested for loops. For correctness team and thread loops + // make the assumption that all work inside can be done + // concurrently. + // // __compute_grid_start - const int Nteams = 2; + const int Nteams = 2; const int Nthreads = 2; // __compute_grid_end - RAJA::launch(select_cpu_or_gpu, - RAJA::LaunchParams(RAJA::Teams(Nteams,Nteams), - RAJA::Threads(Nthreads,Nthreads)), - - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - // _team_loops_start - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, Nteams), [&] (int by) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, Nteams), [&] (int bx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, Nthreads), [&] (int ty) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, Nthreads), [&] (int tx) { - printf("RAJA Teams: threadId_x %d threadId_y %d teamId_x %d teamId_y %d \n", - tx, ty, bx, by); - - - }); - }); - - }); - }); - // _team_loops_end - - }); - - //Equivalent C style loops - if(select_cpu_or_gpu == RAJA::ExecPlace::HOST) { + RAJA::launch( + select_cpu_or_gpu, + RAJA::LaunchParams(RAJA::Teams(Nteams, Nteams), + RAJA::Threads(Nthreads, Nthreads)), + + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + // _team_loops_start + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, Nteams), [&](int by) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, Nteams), [&](int bx) { + RAJA::loop( + ctx, + RAJA::TypedRangeSegment(0, Nthreads), + [&](int ty) { + RAJA::loop( + ctx, + RAJA::TypedRangeSegment(0, Nthreads), + [&](int tx) { + printf("RAJA Teams: threadId_x %d threadId_y " + "%d teamId_x %d teamId_y %d \n", + tx, + ty, + bx, + by); + }); + }); + }); + }); + // _team_loops_end + }); + + // Equivalent C style loops + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) + { // _c_style_loops_start - for (int by=0; by>>(); cudaDeviceSynchronize(); #endif #if defined(RAJA_ENABLE_HIP) - if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) hipLaunchKernelGGL((gpuKernel), dim3(griddim), dim3(blockdim), 0, 0); hipDeviceSynchronize(); #endif diff --git a/examples/tut_matrix-multiply.cpp b/examples/tut_matrix-multiply.cpp index e939d96dbb..53666d20e6 100644 --- a/examples/tut_matrix-multiply.cpp +++ b/examples/tut_matrix-multiply.cpp @@ -64,9 +64,11 @@ __global__ void matMultKernel(int N, double* C, double* A, double* B) int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; - if ( row < N && col < N ) { + if (row < N && col < N) + { double dot = 0.0; - for (int k = 0; k < N; ++k) { + for (int k = 0; k < N; ++k) + { dot += A(row, k) * B(k, col); } @@ -79,7 +81,7 @@ __global__ void matMultKernel(int N, double* C, double* A, double* B) // Functions for checking results // template -void checkResult(T *C, int N); +void checkResult(T* C, int N); template void checkResult(RAJA::View> Cview, int N); @@ -88,262 +90,256 @@ void checkResult(RAJA::View> Cview, int N); // Functions for printing results // template -void printResult(T *C, int N); +void printResult(T* C, int N); template void printResult(RAJA::View> Cview, int N); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix multiplication example...\n"; -// -// Define num rows/cols in matrix -// + // + // Define num rows/cols in matrix + // const int N = 1000; -//const int N = CUDA_BLOCK_SIZE * CUDA_BLOCK_SIZE; + // const int N = CUDA_BLOCK_SIZE 
* CUDA_BLOCK_SIZE; -// -// Allocate and initialize matrix data. -// - double *A = memoryManager::allocate(N * N); - double *B = memoryManager::allocate(N * N); - double *C = memoryManager::allocate(N * N); - - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { + // + // Allocate and initialize matrix data. + // + double* A = memoryManager::allocate(N * N); + double* B = memoryManager::allocate(N * N); + double* C = memoryManager::allocate(N * N); + + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { A(row, col) = row; B(row, col) = col; } } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of matrix multiplication...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_cstyle_start - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { double dot = 0.0; - for (int k = 0; k < N; ++k) { + for (int k = 0; k < N; ++k) + { dot += A(row, k) * B(k, col); } C(row, col) = dot; - } } // _matmult_cstyle_end checkResult(C, N); -//printResult(C, N); + // printResult(C, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// We define RAJA range segments to define the ranges of -// row, column, and dot-product loops for RAJA variants -// + // + // We define RAJA range segments to define the ranges of + // row, column, and dot-product loops for RAJA variants + // // _matmult_ranges_start RAJA::TypedRangeSegment row_range(0, N); RAJA::TypedRangeSegment col_range(0, N); RAJA::TypedRangeSegment dot_range(0, N); // _matmult_ranges_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// For the RAJA implementations of matrix multiplication, we -// use RAJA 'View' objects to access the matrix data. A RAJA view -// holds a pointer to a data array and enables multi-dimensional indexing -// into that data, similar to the macros we defined above. -// + // + // For the RAJA implementations of matrix multiplication, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into that data, similar to the macros we defined above. + // // _matmult_views_start RAJA::View> Aview(A, N, N); RAJA::View> Bview(B, N, N); RAJA::View> Cview(C, N, N); // _matmult_views_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// In the next few examples, we show ways that we can use RAJA::forall -// statements for the matrix multiplication kernel. This usage is not -// recommended for performance reasons. Specifically, it limits the amount -// of parallelism that can be exposed to less than is possible. We show -// this usage here, to make this point clear. Later in this file, we -// introduce RAJA nested loop abstractions and show that we can extract all -// available parallelism. -// -// -// In the first RAJA implementation, we replace the outer 'row' loop -// with a RAJA::forall statement. 
The lambda expression contains the -// inner loops. -// + // + // In the next few examples, we show ways that we can use RAJA::forall + // statements for the matrix multiplication kernel. This usage is not + // recommended for performance reasons. Specifically, it limits the amount + // of parallelism that can be exposed to less than is possible. We show + // this usage here, to make this point clear. Later in this file, we + // introduce RAJA nested loop abstractions and show that we can extract all + // available parallelism. + // + // + // In the first RAJA implementation, we replace the outer 'row' loop + // with a RAJA::forall statement. The lambda expression contains the + // inner loops. + // -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential mat-mult (RAJA-row)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_outerforall_start - RAJA::forall( row_range, [=](int row) { - - for (int col = 0; col < N; ++col) { + RAJA::forall(row_range, [=](int row) { + for (int col = 0; col < N; ++col) + { double dot = 0.0; - for (int k = 0; k < N; ++k) { + for (int k = 0; k < N; ++k) + { dot += Aview(row, k) * Bview(k, col); } Cview(row, col) = dot; - } - }); // _matmult_outerforall_end checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Next, we replace the outer 'row' loop and the inner 'col' loop -// with RAJA::forall statements. This will also work with parallel -// execution policies, such as OpenMP and CUDA, with caveats and -// restrictions. -// -// However, nesting RAJA::forall calls like this is not recommended as -// it limits the ability to expose parallelism and flexibility for -// implementation alternatives. -// + // + // Next, we replace the outer 'row' loop and the inner 'col' loop + // with RAJA::forall statements. This will also work with parallel + // execution policies, such as OpenMP and CUDA, with caveats and + // restrictions. + // + // However, nesting RAJA::forall calls like this is not recommended as + // it limits the ability to expose parallelism and flexibility for + // implementation alternatives. + // std::cout << "\n Running sequential mat-mult (RAJA-row, RAJA-col)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_nestedforall_start - RAJA::forall( row_range, [=](int row) { - - RAJA::forall( col_range, [=](int col) { - + RAJA::forall(row_range, [=](int row) { + RAJA::forall(col_range, [=](int col) { double dot = 0.0; - for (int k = 0; k < N; ++k) { + for (int k = 0; k < N; ++k) + { dot += Aview(row, k) * Bview(k, col); } Cview(row, col) = dot; - }); - }); // _matmult_nestedforall_end checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Next, we use a RAJA::kernel method to execute the kernel. These examples, -// illustrate the basic kernel interface and mechanics. The execution policies -// express the outer row and col loops using the RAJA kernel interface. 
Later, -// in this file we show some more complex policy examples where we express all -// three loops using the kernel interface and use additional kernel features. -// -// This is different than RAJA::forall and so a few points of exmplanation -// are in order: -// -// 1) A range and lambda index argument are required for each level in -// the loop nest. Here, we have two of each since we have a doubly-nested -// loop. -// 2) A range for each loop nest level is specified in a RAJA tuple object. -// The order of ranges in the tuple must match the order of args to the -// lambda for this to be correct, in general. RAJA provides strongly-typed -// indices to help with this. However, this example does not use them. -// 3) An execution policy is required for each level in the loop nest. These -// are specified in the 'RAJA::statement::For' templates in the -// 'RAJA::KernelPolicy type. -// 4) The loop nest ordering is specified in the nested execution policy -- -// the first 'For' policy is the outermost loop, the second 'For' policy -// is the loop nested inside the outermost loop, and so on. -// 5) The integer values that are the first template arguments to the policies -// indicate which range/lambda argument, the policy applies to. -// + // + // Next, we use a RAJA::kernel method to execute the kernel. These examples, + // illustrate the basic kernel interface and mechanics. The execution policies + // express the outer row and col loops using the RAJA kernel interface. Later, + // in this file we show some more complex policy examples where we express all + // three loops using the kernel interface and use additional kernel features. + // + // This is different than RAJA::forall and so a few points of exmplanation + // are in order: + // + // 1) A range and lambda index argument are required for each level in + // the loop nest. Here, we have two of each since we have a doubly-nested + // loop. + // 2) A range for each loop nest level is specified in a RAJA tuple object. + // The order of ranges in the tuple must match the order of args to the + // lambda for this to be correct, in general. RAJA provides strongly-typed + // indices to help with this. However, this example does not use them. + // 3) An execution policy is required for each level in the loop nest. These + // are specified in the 'RAJA::statement::For' templates in the + // 'RAJA::KernelPolicy type. + // 4) The loop nest ordering is specified in the nested execution policy -- + // the first 'For' policy is the outermost loop, the second 'For' policy + // is the loop nested inside the outermost loop, and so on. + // 5) The integer values that are the first template arguments to the policies + // indicate which range/lambda argument, the policy applies to. 
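  // As a concrete illustration of points 1-5, a minimal self-contained sketch
  // (the range, policy, and lambda names here are hypothetical and are not
  // part of this example) might look like:
  //
  //   RAJA::TypedRangeSegment<int> is(0, 3);   // tuple position 0
  //   RAJA::TypedRangeSegment<int> js(0, 4);   // tuple position 1
  //
  //   using SKETCH_POL = RAJA::KernelPolicy<
  //     RAJA::statement::For<1, RAJA::seq_exec,     // outer loop over js
  //       RAJA::statement::For<0, RAJA::seq_exec,   // inner loop over is
  //         RAJA::statement::Lambda<0>
  //       >
  //     >
  //   >;
  //
  //   RAJA::kernel<SKETCH_POL>(RAJA::make_tuple(is, js),
  //     [=](int i, int j) {  // one lambda argument per range, in tuple order
  //       std::cout << "i = " << i << ", j = " << j << std::endl;
  //     });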
+ // std::cout << "\n Running sequential mat-mult (RAJA-nested)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_basickernel_start - using EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, // row - RAJA::statement::For<0, RAJA::seq_exec, // col - RAJA::statement::Lambda<0> - > - > - >; + using EXEC_POL = RAJA::KernelPolicy< + RAJA::statement::For<1, + RAJA::seq_exec, // row + RAJA::statement::For<0, + RAJA::seq_exec, // col + RAJA::statement::Lambda<0>>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=](int col, int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - - }); + [=](int col, int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); // _matmult_basickernel_end checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running OpenMP mat-mult (RAJA-nested - omp outer)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_ompkernel_start - using EXEC_POL1 = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::omp_parallel_for_exec, // row - RAJA::statement::For<0, RAJA::seq_exec, // col - RAJA::statement::Lambda<0> - > - > - >; + using EXEC_POL1 = RAJA::KernelPolicy< + RAJA::statement::For<1, + RAJA::omp_parallel_for_exec, // row + RAJA::statement::For<0, + RAJA::seq_exec, // col + RAJA::statement::Lambda<0>>>>; // _matmult_ompkernel_end RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=](int col, int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - - }); + [=](int col, int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP mat-mult (RAJA-nested - omp inner)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // Swapping the template arguments in this nested policy swaps the loop @@ -353,70 +349,63 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // sequentially, while row (inner) iterations execute in parallel. 
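  // For reference, the EXEC_POL2 policy defined below corresponds roughly to
  // the following hand-written loop nest (an illustrative sketch only, not
  // part of this example):
  //
  //   for (int col = 0; col < N; ++col) {        // sequential outer loop
  //     #pragma omp parallel for
  //     for (int row = 0; row < N; ++row) {      // parallel inner loop
  //       double dot = 0.0;
  //       for (int k = 0; k < N; ++k) {
  //         dot += Aview(row, k) * Bview(k, col);
  //       }
  //       Cview(row, col) = dot;
  //     }
  //   }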
// // _matmult_ompkernel_swap_start - using EXEC_POL2 = - RAJA::KernelPolicy< - RAJA::statement::For<0, RAJA::seq_exec, // col - RAJA::statement::For<1, RAJA::omp_parallel_for_exec, // row - RAJA::statement::Lambda<0> - > - > - >; + using EXEC_POL2 = RAJA::KernelPolicy>>>; // _matmult_ompkernel_swap_end - RAJA::kernel( RAJA::make_tuple(col_range, row_range), - [=](int col, int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - - }); + RAJA::kernel(RAJA::make_tuple(col_range, row_range), + [=](int col, int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP mat-mult (RAJA-nested - collapse)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This policy collapses the row and col loops in an OpenMP parallel region. // This is the same as using an OpenMP 'parallel for' directive on the // outer loop with a 'collapse(2) clause. // - using EXEC_POL3 = - RAJA::KernelPolicy< + using EXEC_POL3 = RAJA::KernelPolicy< RAJA::statement::Collapse, // row, col - RAJA::statement::Lambda<0> - > - >; + RAJA::ArgList<1, 0>, // row, col + RAJA::statement::Lambda<0>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=](int col, int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - - }); + [=](int col, int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); checkResult(Cview, N); -//printResult(Cview, N); +// printResult(Cview, N); #endif // if RAJA_ENABLE_OPENMP -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running CUDA mat-mult (RAJA-nested - POL4)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This policy replaces the loop nest with a single CUDA kernel launch @@ -430,35 +419,30 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // using EXEC_POL4 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::For<1, RAJA::cuda_block_x_loop, - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, - RAJA::statement::Lambda<0> - > - > - > - >; + RAJA::KernelPolicy>>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=] RAJA_DEVICE (int col, int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - - }); + [=] RAJA_DEVICE(int col, int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA tiled mat-mult (RAJA-POL5)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * 
N * sizeof(double)); // // This policy collapses the col and row loops into a single CUDA kernel @@ -470,50 +454,50 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // same as in this kernel and the one above. // using EXEC_POL5 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, - RAJA::statement::For<1, RAJA::cuda_thread_y_loop, - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, - RAJA::statement::Lambda<0> - > - > - > - > - > - >; + RAJA::KernelPolicy, + RAJA::cuda_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_loop, + RAJA::statement::For< + 1, + RAJA::cuda_thread_y_loop, + RAJA::statement::For<0, + RAJA::cuda_thread_x_loop, + RAJA::statement::Lambda<0>>>>>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=] RAJA_DEVICE (int col, int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - - }); + [=] RAJA_DEVICE(int col, int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); #endif // if RAJA_ENABLE_CUDA -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) - double *d_A = memoryManager::allocate_gpu(N * N); - double *d_B = memoryManager::allocate_gpu(N * N); - double *d_C = memoryManager::allocate_gpu(N * N); + double* d_A = memoryManager::allocate_gpu(N * N); + double* d_B = memoryManager::allocate_gpu(N * N); + double* d_C = memoryManager::allocate_gpu(N * N); std::cout << "\n Running HIP mat-mult (RAJA-nested - POL4)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); - hipErrchk(hipMemcpy( d_A, A, N * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B, B, N * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_B, B, N * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); RAJA::View> d_Aview(d_A, N, N); RAJA::View> d_Bview(d_B, N, N); @@ -530,38 +514,33 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // and col = threadIdx.x in the kernel. 
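  // Written as a raw HIP/CUDA-style kernel, that mapping corresponds roughly
  // to the sketch below (illustrative only; this kernel is not defined in
  // this file). The block/thread stride loops mirror the *_block_x_loop and
  // *_thread_x_loop policies and reduce to row = blockIdx.x, col = threadIdx.x
  // when enough blocks and threads are launched:
  //
  //   __global__ void rowPerBlockMatMult(int N, double* C, double* A, double* B)
  //   {
  //     for (int row = blockIdx.x; row < N; row += gridDim.x) {
  //       for (int col = threadIdx.x; col < N; col += blockDim.x) {
  //         double dot = 0.0;
  //         for (int k = 0; k < N; ++k) {
  //           dot += A[row * N + k] * B[k * N + col];
  //         }
  //         C[row * N + col] = dot;
  //       }
  //     }
  //   }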
// using EXEC_POL4 = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::For<1, RAJA::hip_block_x_loop, - RAJA::statement::For<0, RAJA::hip_thread_x_loop, - RAJA::statement::Lambda<0> - > - > - > - >; + RAJA::KernelPolicy>>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=] RAJA_DEVICE (int col, int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += d_Aview(row, k) * d_Bview(k, col); - } - - d_Cview(row, col) = dot; - - }); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + [=] RAJA_DEVICE(int col, int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += d_Aview(row, k) * d_Bview(k, col); + } + + d_Cview(row, col) = dot; + }); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP tiled mat-mult (RAJA-POL5)...\n"; - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // // This policy collapses the col and row loops into a single HIP kernel @@ -573,50 +552,49 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // same as in this kernel and the one above. // using EXEC_POL5 = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::hip_block_x_loop, - RAJA::statement::For<1, RAJA::hip_thread_y_loop, - RAJA::statement::For<0, RAJA::hip_thread_x_loop, - RAJA::statement::Lambda<0> - > - > - > - > - > - >; + RAJA::KernelPolicy, + RAJA::hip_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::hip_block_x_loop, + RAJA::statement::For< + 1, + RAJA::hip_thread_y_loop, + RAJA::statement::For<0, + RAJA::hip_thread_x_loop, + RAJA::statement::Lambda<0>>>>>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=] RAJA_DEVICE (int col, int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += d_Aview(row, k) * d_Bview(k, col); - } - - d_Cview(row, col) = dot; - - }); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + [=] RAJA_DEVICE(int col, int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += d_Aview(row, k) * d_Bview(k, col); + } + + d_Cview(row, col) = dot; + }); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); +// printResult(Cview, N); #endif // if RAJA_ENABLE_HIP -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// The following examples use execution policies to express the outer row and -// col loops as well as the inner dot product loop using the RAJA kernel -// interface. They show some more complex policy examples and use additional -// kernel features. -// + // + // The following examples use execution policies to express the outer row and + // col loops as well as the inner dot product loop using the RAJA kernel + // interface. 
They show some more complex policy examples and use additional + // kernel features. + // - std::cout << "\n Running sequential mat-mult with multiple lambdas (RAJA-POL6a)...\n"; + std::cout << "\n Running sequential mat-mult with multiple lambdas " + "(RAJA-POL6a)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This policy executes the col, row and k (inner dot product) loops @@ -632,310 +610,312 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // by all three lambdas. // // _matmult_3lambdakernel_seq_start - using EXEC_POL6a = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 - RAJA::statement::For<2, RAJA::seq_exec, - RAJA::statement::Lambda<1> // inner loop: dot += ... - >, - RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C(row, col) = dot - > - > - >; + using EXEC_POL6a = RAJA::KernelPolicy>, // dot = 0.0 + RAJA::statement::For<2, + RAJA::seq_exec, + RAJA::statement::Lambda<1> // inner loop: dot += + // ... + >, + RAJA::statement:: + Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set + // C(row, + // col) + // = dot + >>>; RAJA::kernel_param( - RAJA::make_tuple(col_range, row_range, dot_range), + RAJA::make_tuple(col_range, row_range, dot_range), - RAJA::tuple{0.0}, // thread local variable for 'dot' + RAJA::tuple{0.0}, // thread local variable for 'dot' - // lambda 0 - [=] (double& dot) { - dot = 0.0; - }, + // lambda 0 + [=](double& dot) { dot = 0.0; }, - // lambda 1 - [=] (int col, int row, int k, double& dot) { - dot += Aview(row, k) * Bview(k, col); - }, + // lambda 1 + [=](int col, int row, int k, double& dot) { + dot += Aview(row, k) * Bview(k, col); + }, - // lambda 2 - [=] (int col, int row, double& dot) { - Cview(row, col) = dot; - } + // lambda 2 + [=](int col, int row, double& dot) { Cview(row, col) = dot; } ); // _matmult_3lambdakernel_seq_end checkResult(Cview, N); - //printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); -// -// The following examples uses an extension of the lambda statement -// to specify lambda arguments. By specifying arguments within statements -// we remove the requirement that lambdas require all of the tuple contents. -// + // + // The following examples uses an extension of the lambda statement + // to specify lambda arguments. By specifying arguments within statements + // we remove the requirement that lambdas require all of the tuple contents. + // - std::cout << "\n Running sequential mat-mult with multiple lambdas - lambda args in statements (RAJA-POL6b)...\n"; + std::cout << "\n Running sequential mat-mult with multiple lambdas - lambda " + "args in statements (RAJA-POL6b)...\n"; // _matmult_3lambdakernel_args_seq_start // Alias for convenience - using RAJA::Segs; using RAJA::Params; + using RAJA::Segs; - using EXEC_POL6b = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0, Params<0>>, // dot = 0.0 - RAJA::statement::For<2, RAJA::seq_exec, - RAJA::statement::Lambda<1, Segs<0,1,2>, Params<0>> // dot += ... 
- >, - RAJA::statement::Lambda<2, Segs<0,1>, Params<0>> // C(row, col) = dot - > - > - >; + using EXEC_POL6b = RAJA::KernelPolicy>, // dot = 0.0 + RAJA::statement::For< + 2, + RAJA::seq_exec, + RAJA::statement::Lambda<1, Segs<0, 1, 2>, Params<0>> // dot += ... + >, + RAJA::statement::Lambda<2, Segs<0, 1>, Params<0>> // C(row, col) = dot + >>>; RAJA::kernel_param( - RAJA::make_tuple(col_range, row_range, dot_range), + RAJA::make_tuple(col_range, row_range, dot_range), - RAJA::tuple{0.0}, // thread local variable for 'dot' + RAJA::tuple{0.0}, // thread local variable for 'dot' - // lambda 0 - [=] (double& dot) { - dot = 0.0; - }, + // lambda 0 + [=](double& dot) { dot = 0.0; }, - // lambda 1 - [=] (int col, int row, int k, double& dot) { - dot += Aview(row, k) * Bview(k, col); - }, + // lambda 1 + [=](int col, int row, int k, double& dot) { + dot += Aview(row, k) * Bview(k, col); + }, - // lambda 2 - [=] (int col, int row, double& dot) { - Cview(row, col) = dot; - } + // lambda 2 + [=](int col, int row, double& dot) { Cview(row, col) = dot; } ); // _matmult_3lambdakernel_args_seq_end checkResult(Cview, N); - //printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running OpenMP mat-mult with multiple lambdas and loop collapse (RAJA-POL7)...\n"; + std::cout << "\n Running OpenMP mat-mult with multiple lambdas and loop " + "collapse (RAJA-POL7)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_3lambdakernel_ompcollapse_start - using EXEC_POL7 = - RAJA::KernelPolicy< - RAJA::statement::Collapse, // row, col - RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 - RAJA::statement::For<2, RAJA::seq_exec, - RAJA::statement::Lambda<1> // inner loop: dot += ... - >, - RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C(row, col) = dot - > - >; + using EXEC_POL7 = RAJA::KernelPolicy, // row, col + RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 + RAJA::statement::For<2, + RAJA::seq_exec, + RAJA::statement::Lambda<1> // inner loop: dot += ... 
+ >, + RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set + // C(row, + // col) = + // dot + >>; // _matmult_3lambdakernel_ompcollapse_end RAJA::kernel_param( - RAJA::make_tuple(col_range, row_range, dot_range), + RAJA::make_tuple(col_range, row_range, dot_range), - RAJA::tuple{0.0}, // thread local variable for 'dot' + RAJA::tuple{0.0}, // thread local variable for 'dot' - // lambda 0 - [=] (double& dot) { - dot = 0.0; - }, + // lambda 0 + [=](double& dot) { dot = 0.0; }, - // lambda 1 - [=] (int col, int row, int k, double& dot) { - dot += Aview(row, k) * Bview(k, col); - }, + // lambda 1 + [=](int col, int row, int k, double& dot) { + dot += Aview(row, k) * Bview(k, col); + }, - // lambda 2 - [=] (int col, int row, double& dot) { - Cview(row, col) = dot; - } + // lambda 2 + [=](int col, int row, double& dot) { Cview(row, col) = dot; } ); checkResult(Cview, N); -//printResult(Cview, N); +// printResult(Cview, N); #endif // if RAJA_ENABLE_OPENMP -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) - std::cout << "\n Running CUDA mat-mult with multiple lambdas (RAJA-POL8)...\n"; + std::cout << "\n Running CUDA mat-mult with multiple lambdas " + "(RAJA-POL8)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_3lambdakernel_cuda_start using EXEC_POL8 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::For<1, RAJA::cuda_block_x_loop, // row - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // col - RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 - RAJA::statement::For<2, RAJA::seq_exec, - RAJA::statement::Lambda<1> // dot += ... - >, - RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C = ... - > - > - > - >; + RAJA::KernelPolicy>, // dot = 0.0 + RAJA::statement::For<2, + RAJA::seq_exec, + RAJA::statement::Lambda<1> // dot += ... + >, + RAJA::statement:: + Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C = ... 
+ >>>>; // _matmult_3lambdakernel_cuda_end RAJA::kernel_param( - RAJA::make_tuple(col_range, row_range, dot_range), + RAJA::make_tuple(col_range, row_range, dot_range), - RAJA::tuple{0.0}, // thread local variable for 'dot' + RAJA::tuple{0.0}, // thread local variable for 'dot' - // lambda 0 - [=] RAJA_DEVICE (double& dot) { - dot = 0.0; - }, + // lambda 0 + [=] RAJA_DEVICE(double& dot) { dot = 0.0; }, - // lambda 1 - [=] RAJA_DEVICE (int col, int row, int k, double& dot) { - dot += Aview(row, k) * Bview(k, col); - }, + // lambda 1 + [=] RAJA_DEVICE(int col, int row, int k, double& dot) { + dot += Aview(row, k) * Bview(k, col); + }, - // lambda 2 - [=] RAJA_DEVICE (int col, int row, double& dot) { - Cview(row, col) = dot; - } + // lambda 2 + [=] RAJA_DEVICE(int col, int row, double& dot) { Cview(row, col) = dot; } ); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::cout << "\n Running CUDA mat-mult with multiple lambdas (RAJA-POL9a)...\n"; + std::cout << "\n Running CUDA mat-mult with multiple lambdas " + "(RAJA-POL9a)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_3lambdakernel_cudatiled_start using EXEC_POL9a = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::cuda_block_x_loop, - RAJA::statement::For<1, RAJA::cuda_thread_y_loop, // row - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // col - RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 - RAJA::statement::For<2, RAJA::seq_exec, - RAJA::statement::Lambda<1> // dot += ... - >, - RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C = ... - > - > - > - > - > - >; + RAJA::KernelPolicy, + RAJA::cuda_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_loop, + RAJA::statement::For< + 1, + RAJA::cuda_thread_y_loop, // row + RAJA::statement::For< + 0, + RAJA::cuda_thread_x_loop, // col + RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 + RAJA::statement::For<2, + RAJA::seq_exec, + RAJA::statement::Lambda<1> // dot += + // ... + >, + RAJA::statement:: + Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C + // = ... 
+ >>>>>>; // _matmult_3lambdakernel_cudatiled_end RAJA::kernel_param( - RAJA::make_tuple(col_range, row_range, dot_range), + RAJA::make_tuple(col_range, row_range, dot_range), - RAJA::tuple{0.0}, // thread local variable for 'dot' + RAJA::tuple{0.0}, // thread local variable for 'dot' - // lambda 0 - [=] RAJA_DEVICE (double& dot) { - dot = 0.0; - }, + // lambda 0 + [=] RAJA_DEVICE(double& dot) { dot = 0.0; }, - // lambda 1 - [=] RAJA_DEVICE (int col, int row, int k, double& dot) { - dot += Aview(row, k) * Bview(k, col); - }, + // lambda 1 + [=] RAJA_DEVICE(int col, int row, int k, double& dot) { + dot += Aview(row, k) * Bview(k, col); + }, - // lambda 2 - [=] RAJA_DEVICE (int col, int row, double& dot) { - Cview(row, col) = dot; - } + // lambda 2 + [=] RAJA_DEVICE(int col, int row, double& dot) { Cview(row, col) = dot; } ); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::cout << "\n Running CUDA mat-mult with multiple lambdas - lambda args in statements (RAJA-POL9b)...\n"; + std::cout << "\n Running CUDA mat-mult with multiple lambdas - lambda args " + "in statements (RAJA-POL9b)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); using EXEC_POL9b = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::cuda_block_x_loop, - RAJA::statement::For<1, RAJA::cuda_thread_y_loop, // row - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // col - RAJA::statement::Lambda<0, Params<0>>, // dot = 0.0 - RAJA::statement::For<2, RAJA::seq_exec, - RAJA::statement::Lambda<1, Segs<0,1,2>, Params<0>> // dot += ... - >, - RAJA::statement::Lambda<2, Segs<0,1>, Params<0>> // set C = ... - > - > - > - > - > - >; + RAJA::KernelPolicy, + RAJA::cuda_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_loop, + RAJA::statement::For< + 1, + RAJA::cuda_thread_y_loop, // row + RAJA::statement::For< + 0, + RAJA::cuda_thread_x_loop, // col + RAJA::statement::Lambda<0, Params<0>>, // dot = 0.0 + RAJA::statement::For< + 2, + RAJA::seq_exec, + RAJA::statement:: + Lambda<1, Segs<0, 1, 2>, Params<0>> // dot += ... + >, + RAJA::statement::Lambda<2, Segs<0, 1>, Params<0>> // set C + // = ... 
+ >>>>>>; RAJA::kernel_param( - RAJA::make_tuple(col_range, row_range, dot_range), + RAJA::make_tuple(col_range, row_range, dot_range), - RAJA::tuple{0.0}, // thread local variable for 'dot' + RAJA::tuple{0.0}, // thread local variable for 'dot' - // lambda 0 - [=] RAJA_DEVICE (double& dot) { - dot = 0.0; - }, + // lambda 0 + [=] RAJA_DEVICE(double& dot) { dot = 0.0; }, - // lambda 1 - [=] RAJA_DEVICE (int col, int row, int k, double& dot) { - dot += Aview(row, k) * Bview(k, col); - }, + // lambda 1 + [=] RAJA_DEVICE(int col, int row, int k, double& dot) { + dot += Aview(row, k) * Bview(k, col); + }, - // lambda 2 - [=] RAJA_DEVICE (int col, int row, double& dot) { - Cview(row, col) = dot; - } + // lambda 2 + [=] RAJA_DEVICE(int col, int row, double& dot) { Cview(row, col) = dot; } ); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running mat-mult with tiling + shared memory...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // This example builds on the RAJA tiling capabilities presented earlier // and uses RAJA LocalArray's to load tiles of the global matrix @@ -945,134 +925,152 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // synchronization. We recommend viewing tut_matrix-transpose-local-array.cpp // for an introduction to RAJA LocalArray types and thread synchronization. - using Shmem = RAJA::LocalArray>; - - using shmem_Lambda0 = RAJA::statement::Lambda<0, RAJA::Offsets<0, 2>, RAJA::Params<2>>; - using shmem_Lambda1 = RAJA::statement::Lambda<1, RAJA::Segs<0, 1>, RAJA::Offsets<0, 1>, RAJA::Params<0>>; - using shmem_Lambda2 = RAJA::statement::Lambda<2, RAJA::Segs<1, 2>, RAJA::Offsets<1, 2>, RAJA::Params<1>>; - using shmem_Lambda3 = RAJA::statement::Lambda<3, RAJA::Offsets<0, 1, 2>, RAJA::Params<0, 1, 2>>; - using shmem_Lambda4 = RAJA::statement::Lambda<4, RAJA::Segs<0, 2>, RAJA::Offsets<0, 2>, RAJA::Params<2>>; - - using EXEC_POL10 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernelFixed, + using Shmem = + RAJA::LocalArray>; + + using shmem_Lambda0 = + RAJA::statement::Lambda<0, RAJA::Offsets<0, 2>, RAJA::Params<2>>; + using shmem_Lambda1 = RAJA::statement:: + Lambda<1, RAJA::Segs<0, 1>, RAJA::Offsets<0, 1>, RAJA::Params<0>>; + using shmem_Lambda2 = RAJA::statement:: + Lambda<2, RAJA::Segs<1, 2>, RAJA::Offsets<1, 2>, RAJA::Params<1>>; + using shmem_Lambda3 = + RAJA::statement::Lambda<3, RAJA::Offsets<0, 1, 2>, RAJA::Params<0, 1, 2>>; + using shmem_Lambda4 = RAJA::statement:: + Lambda<4, RAJA::Segs<0, 2>, RAJA::Offsets<0, 2>, RAJA::Params<2>>; + + using EXEC_POL10 = RAJA::KernelPolicy, // Tile rows and cols of C (the result matrix C) - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_direct, - RAJA::statement::Tile<2, RAJA::tile_fixed, RAJA::cuda_block_y_direct, - - // zero out shmem tile of C - RAJA::statement::For<2, RAJA::cuda_thread_y_loop, - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, - shmem_Lambda0 > >, - - // Slide window across matrix: Load tiles of global matrices A, B and compute - // local dot products - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - - // Load tile of A into shmem - RAJA::statement::For<1, RAJA::cuda_thread_y_loop, - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, - shmem_Lambda1 - > - >, - - // Load tile of B into shmem - 
RAJA::statement::For<2, RAJA::cuda_thread_y_loop, - RAJA::statement::For<1, RAJA::cuda_thread_x_loop, - shmem_Lambda2 - > - >, - - RAJA::statement::CudaSyncThreads, - - //Partial multiplication - RAJA::statement::For<2, RAJA::cuda_thread_y_loop, - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, - shmem_Lambda3 - > - > - >, - - RAJA::statement::CudaSyncThreads - >, //sliding window - - //Write memory out to global matrix - RAJA::statement::For<2, RAJA::cuda_thread_y_loop, - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, - shmem_Lambda4 > > - > - > - > //Create shared memory - >//Cuda kernel - >; - - Shmem aShared, bShared, cShared; - - RAJA::kernel_param( + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_direct, + RAJA::statement::Tile< + 2, + RAJA::tile_fixed, + RAJA::cuda_block_y_direct, + + // zero out shmem tile of C + RAJA::statement::For< + 2, + RAJA::cuda_thread_y_loop, + RAJA::statement:: + For<0, RAJA::cuda_thread_x_loop, shmem_Lambda0>>, + + // Slide window across matrix: Load tiles of global matrices + // A, B and compute local dot products + RAJA::statement::Tile< + 1, + RAJA::tile_fixed, + RAJA::seq_exec, + + // Load tile of A into shmem + RAJA::statement::For< + 1, + RAJA::cuda_thread_y_loop, + RAJA::statement:: + For<0, RAJA::cuda_thread_x_loop, shmem_Lambda1>>, + + // Load tile of B into shmem + RAJA::statement::For< + 2, + RAJA::cuda_thread_y_loop, + RAJA::statement:: + For<1, RAJA::cuda_thread_x_loop, shmem_Lambda2>>, + + RAJA::statement::CudaSyncThreads, + + // Partial multiplication + RAJA::statement::For< + 2, + RAJA::cuda_thread_y_loop, + RAJA::statement::For< + 1, + RAJA::seq_exec, + RAJA::statement::For<0, + RAJA::cuda_thread_x_loop, + shmem_Lambda3>>>, + + RAJA::statement::CudaSyncThreads>, // sliding window + + // Write memory out to global matrix + RAJA::statement::For< + 2, + RAJA::cuda_thread_y_loop, + RAJA::statement::For<0, + RAJA::cuda_thread_x_loop, + shmem_Lambda4>>>>> // Create shared + // memory + > // Cuda kernel + >; + + Shmem aShared, bShared, cShared; + + RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N)), RAJA::make_tuple(aShared, bShared, cShared), - // Zero out thread local memory for storing dot products - [=] RAJA_HOST_DEVICE (int tn, int tp, Shmem &cShared) { - - cShared(tn,tp) = 0.0; - - }, - - // Load tile of A - [=] RAJA_HOST_DEVICE (int n, int m, int tn, int tm, Shmem &aShared) { - - aShared(tn, tm) = Aview(n, m); - - }, - - // Load tile of B - [=] RAJA_HOST_DEVICE (int m, int p, int tm, int tp, Shmem &bShared) { - - bShared(tm, tp) = Bview(m, p); - - }, - - // Do partial update in shmem - [=] RAJA_HOST_DEVICE (int tn, int tm, int tp, Shmem &aShared, Shmem &bShared, Shmem & cShared) { - - cShared(tn,tp) += aShared(tn,tm) * bShared(tm, tp); - - }, - - // Write out complete result - [=] RAJA_HOST_DEVICE (int n, int p, int tn, int tp, Shmem &cShared) { - - Cview(n,p) = cShared(tn,tp); - - }); + // Zero out thread local memory for storing dot products + [=] RAJA_HOST_DEVICE(int tn, int tp, Shmem& cShared) { + cShared(tn, tp) = 0.0; + }, + + // Load tile of A + [=] RAJA_HOST_DEVICE(int n, int m, int tn, int tm, Shmem& aShared) { + aShared(tn, tm) = Aview(n, m); + }, + + // Load tile of B + [=] RAJA_HOST_DEVICE(int m, int p, int tm, int tp, Shmem& bShared) { + bShared(tm, tp) = Bview(m, p); + }, + + // Do partial update in shmem + [=] RAJA_HOST_DEVICE(int tn, + int tm, + int tp, + Shmem& aShared, + 
Shmem& bShared, + Shmem& cShared) { + cShared(tn, tp) += aShared(tn, tm) * bShared(tm, tp); + }, + + // Write out complete result + [=] RAJA_HOST_DEVICE(int n, int p, int tn, int tp, Shmem& cShared) { + Cview(n, p) = cShared(tn, tp); + }); checkResult(Cview, N); -//printResult(Cview, N); +// printResult(Cview, N); #endif // if RAJA_ENABLE_CUDA -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running CUDA tiled mat-mult (no RAJA)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // Define thread block dimensions dim3 blockdim(CUDA_BLOCK_SIZE, CUDA_BLOCK_SIZE); // Define grid dimensions to match the RAJA version above - dim3 griddim(RAJA_DIVIDE_CEILING_INT(N,blockdim.x), - RAJA_DIVIDE_CEILING_INT(N,blockdim.y)); + dim3 griddim(RAJA_DIVIDE_CEILING_INT(N, blockdim.x), + RAJA_DIVIDE_CEILING_INT(N, blockdim.y)); -//printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, (int)griddim.y, (int)blockdim.x, (int)blockdim.y); + // printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, + // (int)griddim.y, (int)blockdim.x, (int)blockdim.y); // Launch CUDA kernel defined near the top of this file. matMultKernel<<>>(N, C, A, B); @@ -1080,154 +1078,158 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) cudaDeviceSynchronize(); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); #endif // if RAJA_ENABLE_CUDA -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running HIP mat-mult with multiple lambdas (RAJA-POL8)...\n"; - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // _matmult_3lambdakernel_hip_start using EXEC_POL8 = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::For<1, RAJA::hip_block_x_loop, // row - RAJA::statement::For<0, RAJA::hip_thread_x_loop, // col - RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 - RAJA::statement::For<2, RAJA::seq_exec, - RAJA::statement::Lambda<1> // dot += ... - >, - RAJA::statement::Lambda<2, - RAJA::Segs<0,1>, RAJA::Params<0>> // set C = ... - > - > - > - >; + RAJA::KernelPolicy>, // dot = 0.0 + RAJA::statement::For<2, + RAJA::seq_exec, + RAJA::statement::Lambda<1> // dot += ... + >, + RAJA::statement:: + Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C = ... 
+ >>>>; // _matmult_3lambdakernel_hip_end RAJA::kernel_param( - RAJA::make_tuple(col_range, row_range, dot_range), + RAJA::make_tuple(col_range, row_range, dot_range), - RAJA::tuple{0.0}, // thread local variable for 'dot' + RAJA::tuple{0.0}, // thread local variable for 'dot' - // lambda 0 - [=] RAJA_DEVICE (double& dot) { - dot = 0.0; - }, + // lambda 0 + [=] RAJA_DEVICE(double& dot) { dot = 0.0; }, - // lambda 1 - [=] RAJA_DEVICE (int col, int row, int k, double& dot) { - dot += d_Aview(row, k) * d_Bview(k, col); - }, + // lambda 1 + [=] RAJA_DEVICE(int col, int row, int k, double& dot) { + dot += d_Aview(row, k) * d_Bview(k, col); + }, - // lambda 2 - [=] RAJA_DEVICE (int col, int row, double& dot) { - d_Cview(row, col) = dot; - } + // lambda 2 + [=] RAJA_DEVICE(int col, int row, double& dot) { + d_Cview(row, col) = dot; + } ); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); //----------------------------------------------------------------------------// - std::cout << "\n Running HIP mat-mult with multiple lambdas - lambda args in statements (RAJA-POL9)...\n"; + std::cout << "\n Running HIP mat-mult with multiple lambdas - lambda args in " + "statements (RAJA-POL9)...\n"; - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // _matmult_3lambdakernel_hiptiled_start using EXEC_POL9b = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::hip_block_x_loop, - RAJA::statement::For<1, RAJA::hip_thread_y_loop, // row - RAJA::statement::For<0, RAJA::hip_thread_x_loop, // col - RAJA::statement::Lambda<0, Params<0>>, // dot = 0.0 - RAJA::statement::For<2, RAJA::seq_exec, - RAJA::statement::Lambda<1, Segs<0,1,2>, Params<0>> // dot += ... - >, - RAJA::statement::Lambda<2, Segs<0,1>, Params<0>> // set C = ... - > - > - > - > - > - >; - // _matmult_3lambdakernel_hiptiled_end + RAJA::KernelPolicy, + RAJA::hip_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::hip_block_x_loop, + RAJA::statement::For< + 1, + RAJA::hip_thread_y_loop, // row + RAJA::statement::For< + 0, + RAJA::hip_thread_x_loop, // col + RAJA::statement::Lambda<0, Params<0>>, // dot = 0.0 + RAJA::statement::For< + 2, + RAJA::seq_exec, + RAJA::statement:: + Lambda<1, Segs<0, 1, 2>, Params<0>> // dot += ... + >, + RAJA::statement::Lambda<2, Segs<0, 1>, Params<0>> // set C + // = ... 
+ >>>>>>; + // _matmult_3lambdakernel_hiptiled_end RAJA::kernel_param( - RAJA::make_tuple(col_range, row_range, dot_range), + RAJA::make_tuple(col_range, row_range, dot_range), - RAJA::tuple{0.0}, // thread local variable for 'dot' + RAJA::tuple{0.0}, // thread local variable for 'dot' - // lambda 0 - [=] RAJA_DEVICE (double& dot) { - dot = 0.0; - }, + // lambda 0 + [=] RAJA_DEVICE(double& dot) { dot = 0.0; }, - // lambda 1 - [=] RAJA_DEVICE (int col, int row, int k, double& dot) { - dot += d_Aview(row, k) * d_Bview(k, col); - }, + // lambda 1 + [=] RAJA_DEVICE(int col, int row, int k, double& dot) { + dot += d_Aview(row, k) * d_Bview(k, col); + }, - // lambda 2 - [=] RAJA_DEVICE (int col, int row, double& dot) { - d_Cview(row, col) = dot; - } + // lambda 2 + [=] RAJA_DEVICE(int col, int row, double& dot) { + d_Cview(row, col) = dot; + } ); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP tiled mat-mult (no RAJA)...\n"; - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // Define thread block dimensions dim3 blockdim(HIP_BLOCK_SIZE, HIP_BLOCK_SIZE); // Define grid dimensions to match the RAJA version above - dim3 griddim(RAJA_DIVIDE_CEILING_INT(N,blockdim.x), - RAJA_DIVIDE_CEILING_INT(N,blockdim.y)); + dim3 griddim(RAJA_DIVIDE_CEILING_INT(N, blockdim.x), + RAJA_DIVIDE_CEILING_INT(N, blockdim.y)); -//printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, (int)griddim.y, (int)blockdim.x, (int)blockdim.y); + // printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, + // (int)griddim.y, (int)blockdim.x, (int)blockdim.y); // Launch HIP kernel defined near the top of this file. - hipLaunchKernelGGL((matMultKernel), dim3(griddim), dim3(blockdim), 0, 0, N, d_C, d_A, d_B); + hipLaunchKernelGGL( + (matMultKernel), dim3(griddim), dim3(blockdim), 0, 0, N, d_C, d_A, d_B); hipDeviceSynchronize(); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); memoryManager::deallocate_gpu(d_A); memoryManager::deallocate_gpu(d_B); memoryManager::deallocate_gpu(d_C); #endif // if RAJA_ENABLE_HIP -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(A); memoryManager::deallocate(B); memoryManager::deallocate(C); @@ -1244,16 +1246,22 @@ template void checkResult(T* C, int N) { bool match = true; - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - if ( std::abs( C(row, col) - row * col * N ) > 10e-12 ) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + if (std::abs(C(row, col) - row * col * N) > 10e-12) + { match = false; } } } - if ( match ) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -1262,16 +1270,22 @@ template void checkResult(RAJA::View> Cview, int N) { bool match = true; - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - if ( std::abs( Cview(row, col) - row * col * N ) > 10e-12 ) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + if (std::abs(Cview(row, col) - row * col * N) > 10e-12) + { match = false; } } } - if ( match ) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -1283,10 +1297,12 @@ template void printResult(T* C, int N) { std::cout << std::endl; - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - std::cout << "C(" << row << "," << col << ") = " - << C(row, col) << std::endl; + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + std::cout << "C(" << row << "," << col << ") = " << C(row, col) + << std::endl; } } std::cout << std::endl; @@ -1296,10 +1312,12 @@ template void printResult(RAJA::View> Cview, int N) { std::cout << std::endl; - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - std::cout << "C(" << row << "," << col << ") = " - << Cview(row, col) << std::endl; + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + std::cout << "C(" << row << "," << col << ") = " << Cview(row, col) + << std::endl; } } std::cout << std::endl; diff --git a/examples/wave-eqn.cpp b/examples/wave-eqn.cpp index e3b83480ee..ce1a8fc101 100644 --- a/examples/wave-eqn.cpp +++ b/examples/wave-eqn.cpp @@ -13,7 +13,7 @@ #include "RAJA/RAJA.hpp" /* - * Time-Domain Finite Difference + * Time-Domain Finite Difference * Acoustic Wave Equation Solver * * ------[Details]---------------------- @@ -26,7 +26,7 @@ * The scheme uses a second order central difference discretization * for time and a fourth order central difference discretization for space. * Periodic boundary conditions are assumed on the grid [-1,1] x [-1, 1]. - * + * * NOTE: The x and y dimensions are discretized identically. 
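 *
 * Sketch of the update implemented in wave() below, using the code's own
 * variable names ('ct' is the scalar coefficient passed in from main):
 *
 *   coeff     = { -1/12, 4/3, -5/2, 4/3, -1/12 }
 *   lap(i,j)  = sum_{r = -2..2} coeff[r+2] * ( P2(i+r, j) + P2(i, j+r) )
 *   P1(i,j)   = 2 * P2(i,j) - P1(i,j) + ct * lap(i,j)
 *
 * Indices wrap modulo nx to apply the periodic boundary conditions, and the
 * P1/P2 pointers are swapped after each time step in main.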
* ----[RAJA Concepts]------------------- * - RAJA kernels are portable and a single implemenation can run @@ -34,7 +34,7 @@ * * RAJA MaxReduction - RAJA's implementation for computing a maximum value * (MinReduction computes the min) -*/ + */ // // ---[Constant Values]------- @@ -51,7 +51,8 @@ const double PI = 3.14159265359; // h - Spacing between grid points // n - Number of grid points // -struct grid_s { +struct grid_s +{ double ox, dx; int nx; }; @@ -66,16 +67,17 @@ struct grid_s { // template -void wave(T *P1, T *P2, RAJA::RangeSegment fdBounds, double ct, int nx); +void wave(T* P1, T* P2, RAJA::RangeSegment fdBounds, double ct, int nx); double waveSol(double t, double x, double y); -void setIC(double *P1, double *P2, double t0, double t1, grid_s grid); -void computeErr(double *P, double tf, grid_s grid); +void setIC(double* P1, double* P2, double t0, double t1, grid_s grid); +void computeErr(double* P, double tf, grid_s grid); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { - std::cout<<"Time-Domain Finite Difference Acoustic Wave Equation Solver"<(entries); - double *P2 = memoryManager::allocate(entries); + double* P1 = memoryManager::allocate(entries); + double* P2 = memoryManager::allocate(entries); // //----[Time stepping parameters]---- @@ -123,21 +125,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Sequential policy - using fdPolicy = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; + using fdPolicy = RAJA::KernelPolicy>>>; // OpenMP policy - //using fdPolicy = RAJA::KernelPolicy< - //RAJA::statement::For<1, RAJA::omp_parallel_for_exec, + // using fdPolicy = RAJA::KernelPolicy< + // RAJA::statement::For<1, RAJA::omp_parallel_for_exec, // RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; // CUDA policy - //using fdPolicy = - //RAJA::KernelPolicy< + // using fdPolicy = + // RAJA::KernelPolicy< // RAJA::statement::CudaKernel< - // RAJA::statement::Tile<1, RAJA::tile_fixed<16>, RAJA::cuda_block_y_direct, - // RAJA::statement::Tile<0, RAJA::tile_fixed<16>, RAJA::cuda_block_x_direct, + // RAJA::statement::Tile<1, RAJA::tile_fixed<16>, + // RAJA::cuda_block_y_direct, + // RAJA::statement::Tile<0, RAJA::tile_fixed<16>, + // RAJA::cuda_block_x_direct, // RAJA::statement::For<1, RAJA::cuda_thread_y_direct, // RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // RAJA::statement::Lambda<0> @@ -151,13 +156,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) time = 0; setIC(P1, P2, (time - dt), time, grid); - for (int k = 0; k < nt; ++k) { + for (int k = 0; k < nt; ++k) + { wave(P1, P2, fdBounds, ct, grid.nx); time += dt; - double *Temp = P2; + double* Temp = P2; P2 = P1; P1 = Temp; } @@ -185,29 +191,30 @@ double waveSol(double t, double x, double y) // // Error is computed via ||P_{approx}(:) - P_{analytic}(:)||_{inf} // -void computeErr(double *P, double tf, grid_s grid) +void computeErr(double* P, double tf, grid_s grid) { RAJA::RangeSegment fdBounds(0, grid.nx); RAJA::ReduceMax tMax(-1.0); - using initialPolicy = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec , - RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; - - RAJA::kernel(RAJA::make_tuple(fdBounds,fdBounds), - [=] (RAJA::Index_type tx, RAJA::Index_type ty) { - - int id = tx + grid.nx * ty; - double x = grid.ox + tx * grid.dx; - double y 
= grid.ox + ty * grid.dx; - double myErr = std::abs(P[id] - waveSol(tf, x, y)); - - // - // tMax.max() is used to store the maximum value - // - tMax.max(myErr); - }); + using initialPolicy = RAJA::KernelPolicy>>>; + + RAJA::kernel(RAJA::make_tuple(fdBounds, fdBounds), + [=](RAJA::Index_type tx, RAJA::Index_type ty) { + int id = tx + grid.nx * ty; + double x = grid.ox + tx * grid.dx; + double y = grid.ox + ty * grid.dx; + double myErr = + std::abs(P[id] - waveSol(tf, x, y)); + + // + // tMax.max() is used to store the maximum value + // + tMax.max(myErr); + }); double lInfErr = tMax; printf("Max Error = %lg, dx = %f \n", lInfErr, grid.dx); @@ -217,63 +224,64 @@ void computeErr(double *P, double tf, grid_s grid) // // Function to set intial condition // -void setIC(double *P1, double *P2, double t0, double t1, grid_s grid) +void setIC(double* P1, double* P2, double t0, double t1, grid_s grid) { RAJA::RangeSegment fdBounds(0, grid.nx); - using initialPolicy = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>> > >; - - RAJA::kernel(RAJA::make_tuple(fdBounds,fdBounds), - [=] (RAJA::Index_type tx, RAJA::Index_type ty) { - - int id = tx + ty * grid.nx; - double x = grid.ox + tx * grid.dx; - double y = grid.ox + ty * grid.dx; - - P1[id] = waveSol(t0, x, y); - P2[id] = waveSol(t1, x, y); - }); -} + using initialPolicy = RAJA::KernelPolicy>>>; + RAJA::kernel(RAJA::make_tuple(fdBounds, fdBounds), + [=](RAJA::Index_type tx, RAJA::Index_type ty) { + int id = tx + ty * grid.nx; + double x = grid.ox + tx * grid.dx; + double y = grid.ox + ty * grid.dx; + + P1[id] = waveSol(t0, x, y); + P2[id] = waveSol(t1, x, y); + }); +} template -void wave(T *P1, T *P2, RAJA::RangeSegment fdBounds, double ct, int nx) +void wave(T* P1, T* P2, RAJA::RangeSegment fdBounds, double ct, int nx) { - RAJA::kernel(RAJA::make_tuple(fdBounds,fdBounds), - [=] RAJA_HOST_DEVICE (RAJA::Index_type tx, RAJA::Index_type ty) { - // - //Coefficients for fourth order stencil - // - double coeff[5] = { -1.0/12.0, 4.0/3.0, -5.0/2.0, 4.0/3.0, -1.0/12.0}; - - const int id = tx + ty * nx; - double P_old = P1[id]; - double P_curr = P2[id]; - - // - // Compute Laplacian - // - double lap = 0.0; - - for (auto r : RAJA::RangeSegment(-sr, sr + 1)) { - const int xi = (tx + r + nx) % nx; - const int idx = xi + nx * ty; - lap += coeff[r + sr] * P2[idx]; - - const int yi = (ty + r + nx) % nx; - const int idy = tx + nx * yi; - lap += coeff[r + sr] * P2[idy]; - } - - // - // Store result - // - P1[id] = 2 * P_curr - P_old + ct * lap; - - }); + RAJA::kernel( + RAJA::make_tuple(fdBounds, fdBounds), + [=] RAJA_HOST_DEVICE(RAJA::Index_type tx, RAJA::Index_type ty) { + // + // Coefficients for fourth order stencil + // + double coeff[5] = { + -1.0 / 12.0, 4.0 / 3.0, -5.0 / 2.0, 4.0 / 3.0, -1.0 / 12.0}; + + const int id = tx + ty * nx; + double P_old = P1[id]; + double P_curr = P2[id]; + + // + // Compute Laplacian + // + double lap = 0.0; + + for (auto r : RAJA::RangeSegment(-sr, sr + 1)) + { + const int xi = (tx + r + nx) % nx; + const int idx = xi + nx * ty; + lap += coeff[r + sr] * P2[idx]; + + const int yi = (ty + r + nx) % nx; + const int idy = tx + nx * yi; + lap += coeff[r + sr] * P2[idy]; + } + + // + // Store result + // + P1[id] = 2 * P_curr - P_old + ct * lap; + }); } diff --git a/exercises/atomic-histogram.cpp b/exercises/atomic-histogram.cpp index 602a04a10e..ecdc1a9e7d 100644 --- a/exercises/atomic-histogram.cpp +++ b/exercises/atomic-histogram.cpp @@ -19,9 +19,9 @@ 
* * In this exercise, you will use use RAJA atomic operations to compute * an array which represents a histogram of values in another array. - * Given an array of length N containing integers in the interval [0, M), - * you will compute entries in an array 'hist' of length M. Each entry - * hist[i] in the histogram array will equal the number of occurrences of + * Given an array of length N containing integers in the interval [0, M), + * you will compute entries in an array 'hist' of length M. Each entry + * hist[i] in the histogram array will equal the number of occurrences of * the value 'i' in the orginal array. * * This file contains sequential and OpenMP variants of the histogram @@ -41,11 +41,11 @@ Specifies the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -//const int CUDA_BLOCK_SIZE = 256; +// const int CUDA_BLOCK_SIZE = 256; #endif #if defined(RAJA_ENABLE_HIP) -//const int HIP_BLOCK_SIZE = 256; +// const int HIP_BLOCK_SIZE = 256; #endif // @@ -62,7 +62,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Define array bounds and initialize array to compute histogram of values - // on. + // on. // // _array_atomic_histogram_start @@ -72,31 +72,33 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) int* array = memoryManager::allocate(N); int* hist = memoryManager::allocate(M); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { array[i] = rand() % M; } // _array_atomic_histogram_end int* hist_ref = memoryManager::allocate(M); -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style sequential historgram...\n"; std::memset(hist_ref, 0, M * sizeof(int)); - for (int i = 0; i < N; ++i) { - hist_ref[ array[i] ]++; + for (int i = 0; i < N; ++i) + { + hist_ref[array[i]]++; } -//printArray(hist_ref, M); + // printArray(hist_ref, M); -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -104,50 +106,51 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); - #pragma omp parallel for - for (int i = 0; i < N; ++i) { - #pragma omp atomic - hist[ array[i] ]++; +#pragma omp parallel for + for (int i = 0; i < N; ++i) + { +#pragma omp atomic + hist[array[i]]++; } checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); -#endif +#endif -//----------------------------------------------------------------------------// -// RAJA::seq_exec policy enforces strictly sequential execution. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::seq_exec policy enforces strictly sequential execution. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential atomic histogram...\n"; std::memset(hist, 0, M * sizeof(int)); // _range_atomic_histogram_start - //RAJA::TypedRangeSegment array_range(0,N); + // RAJA::TypedRangeSegment array_range(0,N); // _range_atomic_histogram_end /// /// TODO... /// /// EXERCISE: Implement the atomic histogram kernel using a RAJA::forall - /// method with RAJA::seq_exec execution policy type and a + /// method with RAJA::seq_exec execution policy type and a /// RAJA::atomicAdd operation with RAJA::seq_atomic policy. /// /// You will need to uncomment the range segment definition /// above to use it in the kernel. /// - //RAJA::forall(array_range, [=](int i) { - //}); + // RAJA::forall(array_range, [=](int i) { + // }); checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); -//----------------------------------------------------------------------------// -// RAJA omp_atomic policy is used with the RAJA OpenMP execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA omp_atomic policy is used with the RAJA OpenMP execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -159,44 +162,44 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the atomic histogram kernel using a RAJA::forall - /// method with RAJA::omp_parallel_for_exec execution policy type + /// method with RAJA::omp_parallel_for_exec execution policy type /// and a RAJA::atomicAdd operation with RAJA::omp_atomic policy. - /// + /// checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA auto_atomic policy can also be used with the RAJA OpenMP -// execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA auto_atomic policy can also be used with the RAJA OpenMP + // execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP histogram with auto atomic policy...\n"; - + std::memset(hist, 0, M * sizeof(int)); /// /// TODO... /// /// EXERCISE: Implement the atomic histogram kernel using a RAJA::forall - /// method with RAJA::omp_parallel_for_exec execution policy type + /// method with RAJA::omp_parallel_for_exec execution policy type /// and a RAJA::atomicAdd operation with RAJA::auto_atomic policy. /// checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA cuda_atomic policy is used with the RAJA CUDA execution policy. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA cuda_atomic policy is used with the RAJA CUDA execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -216,20 +219,20 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA auto_atomic policy can also be used with the RAJA CUDA -// execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA auto_atomic policy can also be used with the RAJA CUDA + // execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA histogram with auto atomic policy...\n"; - + std::memset(hist, 0, M * sizeof(int)); /// @@ -242,15 +245,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the /// top of the file if you want to use it here. /// - + checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA hip_atomic policy is used with the RAJA HIP execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA hip_atomic policy is used with the RAJA HIP execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) @@ -270,20 +273,20 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA auto_atomic policy can also be used with the RAJA HIP -// execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA auto_atomic policy can also be used with the RAJA HIP + // execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP histogram with auto atomic policy...\n"; - + std::memset(hist, 0, M * sizeof(int)); /// @@ -296,9 +299,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// NOTE: You will need to uncomment 'HIP_BLOCK_SIZE' near the /// top of the file if you want to use it here. 
/// - + checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif @@ -321,12 +324,19 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) void checkResult(int* hist, int* hist_ref, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( correct && hist[i] != hist_ref[i] ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (correct && hist[i] != hist_ref[i]) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -337,7 +347,8 @@ void checkResult(int* hist, int* hist_ref, int len) void printArray(int* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "v[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; diff --git a/exercises/atomic-histogram_solution.cpp b/exercises/atomic-histogram_solution.cpp index 368f729ebc..bb3380ffc4 100644 --- a/exercises/atomic-histogram_solution.cpp +++ b/exercises/atomic-histogram_solution.cpp @@ -19,9 +19,9 @@ * * In this exercise, you will use use RAJA atomic operations to compute * an array which represents a histogram of values in another array. - * Given an array of length N containing integers in the interval [0, M), - * you will compute entries in an array 'hist' of length M. Each entry - * hist[i] in the histogram array will equal the number of occurrences of + * Given an array of length N containing integers in the interval [0, M), + * you will compute entries in an array 'hist' of length M. Each entry + * hist[i] in the histogram array will equal the number of occurrences of * the value 'i' in the orginal array. * * This file contains sequential and OpenMP variants of the histogram @@ -62,7 +62,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Define array bounds and initialize array to compute histogram of values - // on. + // on. // // _array_atomic_histogram_start @@ -72,31 +72,33 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) int* array = memoryManager::allocate(N); int* hist = memoryManager::allocate(M); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { array[i] = rand() % M; } // _array_atomic_histogram_end int* hist_ref = memoryManager::allocate(M); -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style sequential historgram...\n"; std::memset(hist_ref, 0, M * sizeof(int)); - for (int i = 0; i < N; ++i) { - hist_ref[ array[i] ]++; + for (int i = 0; i < N; ++i) + { + hist_ref[array[i]]++; } -//printArray(hist_ref, M); + // printArray(hist_ref, M); -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. 
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -104,43 +106,42 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); - #pragma omp parallel for - for (int i = 0; i < N; ++i) { - #pragma omp atomic - hist[ array[i] ]++; +#pragma omp parallel for + for (int i = 0; i < N; ++i) + { +#pragma omp atomic + hist[array[i]]++; } checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); -#endif +#endif -//----------------------------------------------------------------------------// -// RAJA::seq_exec policy enforces strictly sequential execution. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::seq_exec policy enforces strictly sequential execution. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential atomic histogram...\n"; std::memset(hist, 0, M * sizeof(int)); - // _range_atomic_histogram_start - RAJA::TypedRangeSegment array_range(0,N); - // _range_atomic_histogram_end + // _range_atomic_histogram_start + RAJA::TypedRangeSegment array_range(0, N); + // _range_atomic_histogram_end RAJA::forall(array_range, [=](int i) { - RAJA::atomicAdd(&hist[array[i]], 1); - }); checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); -//----------------------------------------------------------------------------// -// RAJA omp_atomic policy is used with the RAJA OpenMP execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA omp_atomic policy is used with the RAJA OpenMP execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -148,46 +149,42 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); - // _rajaomp_atomic_histogram_start + // _rajaomp_atomic_histogram_start RAJA::forall(array_range, [=](int i) { - RAJA::atomicAdd(&hist[array[i]], 1); - }); // _rajaomp_atomic_histogram_end checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA auto_atomic policy can also be used with the RAJA OpenMP -// execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA auto_atomic policy can also be used with the RAJA OpenMP + // execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP histogram with auto atomic policy...\n"; - + std::memset(hist, 0, M * sizeof(int)); RAJA::forall(array_range, [=](int i) { - RAJA::atomicAdd(&hist[array[i]], 1); - }); - + checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA cuda_atomic policy is used with the RAJA CUDA execution policy. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA cuda_atomic policy is used with the RAJA CUDA execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -195,47 +192,45 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); - // _rajacuda_atomic_histogram_start - RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { - - RAJA::atomicAdd(&hist[array[i]], 1); - - }); + // _rajacuda_atomic_histogram_start + RAJA::forall>( + array_range, [=] RAJA_DEVICE(int i) { + RAJA::atomicAdd(&hist[array[i]], 1); + }); // _rajacuda_atomic_histogram_end checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA auto_atomic policy can also be used with the RAJA CUDA -// execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA auto_atomic policy can also be used with the RAJA CUDA + // execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA histogram with auto atomic policy...\n"; - - std::memset(hist, 0, M * sizeof(int)); - - // _rajacuda_atomicauto_histogram_start - RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { - RAJA::atomicAdd(&hist[array[i]], 1); + std::memset(hist, 0, M * sizeof(int)); - }); + // _rajacuda_atomicauto_histogram_start + RAJA::forall>( + array_range, [=] RAJA_DEVICE(int i) { + RAJA::atomicAdd(&hist[array[i]], 1); + }); // _rajacuda_atomicauto_histogram_end - + checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA hip_atomic policy is used with the RAJA HIP execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA hip_atomic policy is used with the RAJA HIP execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) @@ -243,41 +238,39 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); - // _rajahip_atomic_histogram_start - RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { - - RAJA::atomicAdd(&hist[array[i]], 1); - - }); + // _rajahip_atomic_histogram_start + RAJA::forall>( + array_range, [=] RAJA_DEVICE(int i) { + RAJA::atomicAdd(&hist[array[i]], 1); + }); // _rajahip_atomic_histogram_end checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA auto_atomic policy can also be used with the RAJA HIP -// execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA auto_atomic policy can also be used with the RAJA HIP + // execution policy. 
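// All of the variants in this file instantiate one pattern; a minimal sketch
// (EXEC_POL and ATOMIC_POL are placeholders, standing for any of the matched
// pairs used above, e.g. omp_parallel_for_exec with omp_atomic or
// cuda_exec<CUDA_BLOCK_SIZE> with cuda_atomic):
//
//   RAJA::forall<EXEC_POL>(array_range, [=] (int i) {  // GPU variants add RAJA_DEVICE
//     RAJA::atomicAdd<ATOMIC_POL>(&hist[array[i]], 1);
//   });
//
// The auto_atomic policy selects an atomic implementation compatible with the
// enclosing execution policy, which is why the same lambda body appears
// unchanged in the sequential, OpenMP, CUDA, and HIP variants.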
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP histogram with auto atomic policy...\n"; - - std::memset(hist, 0, M * sizeof(int)); - - // _rajahip_atomicauto_histogram_start - RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { - RAJA::atomicAdd(&hist[array[i]], 1); + std::memset(hist, 0, M * sizeof(int)); - }); + // _rajahip_atomicauto_histogram_start + RAJA::forall>( + array_range, [=] RAJA_DEVICE(int i) { + RAJA::atomicAdd(&hist[array[i]], 1); + }); // _rajahip_atomicauto_histogram_end - + checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif @@ -300,12 +293,19 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) void checkResult(int* hist, int* hist_ref, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( correct && hist[i] != hist_ref[i] ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (correct && hist[i] != hist_ref[i]) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -316,7 +316,8 @@ void checkResult(int* hist, int* hist_ref, int len) void printArray(int* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "v[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; diff --git a/exercises/dot-product.cpp b/exercises/dot-product.cpp index c2830c6cb2..9625220983 100644 --- a/exercises/dot-product.cpp +++ b/exercises/dot-product.cpp @@ -14,9 +14,9 @@ /* * Vector Dot Product Exercise * - * Computes dot = (a,b), where a, b are vectors of + * Computes dot = (a,b), where a, b are vectors of * doubles and dot is a scalar double. It illustrates how RAJA - * supports a portable parallel reduction opertion in a way that + * supports a portable parallel reduction opertion in a way that * the code looks like it does in a sequential implementation. * * RAJA features shown: @@ -33,38 +33,40 @@ // void checkResult(double compdot, double refdot); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: vector dot product...\n"; -// -// Define vector length -// + // + // Define vector length + // constexpr int N = 1000000; -// -// Allocate and initialize vector data -// - double *a = memoryManager::allocate(N); - double *b = memoryManager::allocate(N); + // + // Allocate and initialize vector data + // + double* a = memoryManager::allocate(N); + double* b = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = 1.0; b[i] = 1.0; } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// C-style dot product operation. -// + // + // C-style dot product operation. 
+ // std::cout << "\n Running C-version of dot product...\n"; // _csytle_dotprod_start double dot = 0.0; - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { dot += a[i] * b[i]; } @@ -73,7 +75,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) double dot_ref = dot; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential dot product...\n"; @@ -83,16 +85,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the dot product kernel using a RAJA::seq_exec - /// execution policy type and RAJA::seq_reduce. + /// execution policy type and RAJA::seq_reduce. /// /// NOTE: We've done this one for you to help you get started... /// RAJA::ReduceSum seqdot(0.0); - RAJA::forall(RAJA::TypedRangeSegment(0, N), [=] (int i) { - seqdot += a[i] * b[i]; - }); + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int i) { seqdot += a[i] * b[i]; }); dot = seqdot.get(); @@ -101,7 +102,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(dot, dot_ref); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP dot product...\n"; @@ -111,8 +112,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Implement the dot product kernel using a RAJA::omp_parallel_for_exec - /// execution policy type and RAJA::omp_reduce reduction policy type. + /// EXERCISE: Implement the dot product kernel using a + /// RAJA::omp_parallel_for_exec + /// execution policy type and RAJA::omp_reduce reduction policy + /// type. /// std::cout << "\t (a, b) = " << dot << std::endl; @@ -121,11 +124,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -//const int CUDA_BLOCK_SIZE = 256; + // const int CUDA_BLOCK_SIZE = 256; std::cout << "\n Running RAJA CUDA dot product...\n"; @@ -135,10 +138,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the dot product kernel using a RAJA::cuda_exec - /// execution policy type and RAJA::cuda_reduce reduction policy type. - /// + /// execution policy type and RAJA::cuda_reduce reduction policy + /// type. + /// /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' above. - /// if you want to use it here. + /// if you want to use it here. 
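/// The remaining variants follow the same recipe as the sequential version
/// above; a minimal sketch (REDUCE_POL and EXEC_POL are placeholders for the
/// matched pairs named in the TODOs, e.g. omp_reduce with
/// omp_parallel_for_exec, or cuda_reduce with cuda_exec<CUDA_BLOCK_SIZE>):
///
///   RAJA::ReduceSum<REDUCE_POL, double> dotsum(0.0);
///   RAJA::forall<EXEC_POL>(RAJA::TypedRangeSegment<int>(0, N),
///                          [=] (int i) { dotsum += a[i] * b[i]; });
///   dot = dotsum.get();
///
/// The GPU variants additionally mark the lambda with RAJA_DEVICE and, for
/// HIP, read from the device copies d_a and d_b.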
/// std::cout << "\t (a, b) = " << dot << std::endl; @@ -146,30 +150,31 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(dot, dot_ref); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -//const int HIP_BLOCK_SIZE = 256; + // const int HIP_BLOCK_SIZE = 256; std::cout << "\n Running RAJA HIP dot product...\n"; dot = 0.0; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_b, b, N * sizeof(int), hipMemcpyHostToDevice)); /// /// TODO... /// /// EXERCISE: Implement the dot product kernel using a RAJA::hip_exec - /// execution policy type and RAJA::hip_reduce reduction policy type. - /// + /// execution policy type and RAJA::hip_reduce reduction policy + /// type. + /// /// NOTE: You will need to uncomment 'HIP_BLOCK_SIZE' above - /// if you want to use it here. + /// if you want to use it here. /// std::cout << "\t (a, b) = " << dot << std::endl; @@ -180,11 +185,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) memoryManager::deallocate_gpu(d_b); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) -//const int SYCL_BLOCK_SIZE = 256; + // const int SYCL_BLOCK_SIZE = 256; std::cout << "\n Running RAJA SYCL dot product...\n"; @@ -194,10 +199,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the dot product kernel using a RAJA::sycl_exec - /// execution policy type and RAJA::sycl_reduce. + /// execution policy type and RAJA::sycl_reduce. /// /// NOTE: You will need to uncomment 'SYCL_BLOCK_SIZE' above - /// if you want to use it here. + /// if you want to use it here. /// std::cout << "\t (a, b) = " << dot << std::endl; @@ -206,7 +211,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// memoryManager::deallocate(a); @@ -222,10 +227,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // void checkResult(double compdot, double refdot) { - if ( compdot == refdot ) { + if (compdot == refdot) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } - diff --git a/exercises/dot-product_solution.cpp b/exercises/dot-product_solution.cpp index d0ae458171..ea8acae3c4 100644 --- a/exercises/dot-product_solution.cpp +++ b/exercises/dot-product_solution.cpp @@ -16,9 +16,9 @@ /* * Vector Dot Product Exercise * - * Computes dot = (a,b), where a, b are vectors of + * Computes dot = (a,b), where a, b are vectors of * doubles and dot is a scalar double. It illustrates how RAJA - * supports a portable parallel reduction opertion in a way that + * supports a portable parallel reduction opertion in a way that * the code looks like it does in a sequential implementation. 
* * RAJA features shown: @@ -35,38 +35,40 @@ // void checkResult(double compdot, double refdot); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: vector dot product...\n"; -// -// Define vector length -// + // + // Define vector length + // constexpr int N = 1000000; -// -// Allocate and initialize vector data -// - double *a = memoryManager::allocate(N); - double *b = memoryManager::allocate(N); + // + // Allocate and initialize vector data + // + double* a = memoryManager::allocate(N); + double* b = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = 1.0; b[i] = 1.0; } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// C-style dot product operation. -// + // + // C-style dot product operation. + // std::cout << "\n Running C-version of dot product...\n"; // _csytle_dotprod_start double dot = 0.0; - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { dot += a[i] * b[i]; } @@ -75,7 +77,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) double dot_ref = dot; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential dot product...\n"; @@ -84,9 +86,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _rajaseq_dotprod_start RAJA::ReduceSum seqdot(0.0); - RAJA::forall(RAJA::TypedRangeSegment(0, N), [=] (int i) { - seqdot += a[i] * b[i]; - }); + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int i) { seqdot += a[i] * b[i]; }); dot = seqdot.get(); // _rajaseq_dotprod_end @@ -96,7 +97,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(dot, dot_ref); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP dot product...\n"; @@ -106,9 +107,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _rajaomp_dotprod_start RAJA::ReduceSum ompdot(0.0); - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - ompdot += a[i] * b[i]; - }); + RAJA::forall( + RAJA::RangeSegment(0, N), [=](int i) { ompdot += a[i] * b[i]; }); dot = ompdot.get(); // _rajaomp_dotprod_end @@ -119,7 +119,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -132,10 +132,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _rajacuda_dotprod_start RAJA::ReduceSum cudot(0.0); - RAJA::forall>(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - cudot += a[i] * b[i]; - }); + RAJA::forall>( + RAJA::RangeSegment(0, N), + [=] RAJA_DEVICE(int i) { cudot += a[i] * b[i]; }); dot = cudot.get(); // _rajacuda_dotprod_end @@ -145,7 +144,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(dot, dot_ref); #endif -//----------------------------------------------------------------------------// + 
//----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) @@ -155,19 +154,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) dot = 0.0; - double *d_a = memoryManager::allocate_gpu(N); - double *d_b = memoryManager::allocate_gpu(N); + double* d_a = memoryManager::allocate_gpu(N); + double* d_b = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_b, b, N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_b, b, N * sizeof(double), hipMemcpyHostToDevice)); // _rajahip_dotprod_start RAJA::ReduceSum hpdot(0.0); - RAJA::forall>(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - hpdot += d_a[i] * d_b[i]; - }); + RAJA::forall>( + RAJA::RangeSegment(0, N), + [=] RAJA_DEVICE(int i) { hpdot += d_a[i] * d_b[i]; }); dot = hpdot.get(); // _rajahip_dotprod_end @@ -180,7 +178,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) memoryManager::deallocate_gpu(d_b); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) @@ -193,10 +191,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _rajasycl_dotprod_start RAJA::ReduceSum hpdot(0.0); - RAJA::forall>(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - hpdot += a[i] * b[i]; - }); + RAJA::forall>( + RAJA::RangeSegment(0, N), + [=] RAJA_DEVICE(int i) { hpdot += a[i] * b[i]; }); dot = static_cast(hpdot.get()); // _rajasycl_dotprod_end @@ -207,7 +204,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// memoryManager::deallocate(a); @@ -223,10 +220,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // void checkResult(double compdot, double refdot) { - if ( compdot == refdot ) { + if (compdot == refdot) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } - diff --git a/exercises/kernel-matrix-transpose-local-array.cpp b/exercises/kernel-matrix-transpose-local-array.cpp index 227af7d2be..006dd27e34 100644 --- a/exercises/kernel-matrix-transpose-local-array.cpp +++ b/exercises/kernel-matrix-transpose-local-array.cpp @@ -66,7 +66,7 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA shared matrix transpose exercise...\n"; @@ -87,8 +87,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -104,8 +104,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = 
col; } } @@ -120,8 +122,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // Stack-allocated local array for data on a tile int Tile[TILE_DIM][TILE_DIM]; @@ -132,14 +136,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that input matrix data access // is stride-1. // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[ty][tx] = Aview(row, col); } } @@ -151,19 +158,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1. // - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) + { + for (int ty = 0; ty < TILE_DIM; ++ty) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[ty][tx]; } } } - } } // _mattranspose_localarray_cstyle_end @@ -186,8 +195,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // _mattranspose_localarray_start - using TILE_MEM = - RAJA::LocalArray, RAJA::SizeList>; + using TILE_MEM = RAJA:: + LocalArray, RAJA::SizeList>; TILE_MEM Tile_Array; // _mattranspose_localarray_end @@ -214,19 +223,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Initialize the local memory statement as position 2 + /// EXERCISE: Initialize the local memory statement as position 2 /// in the paramater list. 
/// - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::Lambda<0> + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::seq_exec, RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::seq_exec, RAJA::statement::Lambda<0> > >, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::Lambda<1> + RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::seq_exec, RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::seq_exec, RAJA::statement::Lambda<1> > > @@ -235,7 +244,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -277,8 +286,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // These loops iterate over the number of // tiles needed to carry out the transpose // - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, + RAJA::omp_parallel_for_exec, RAJA::statement::Tile<0, + RAJA::tile_fixed, RAJA::seq_exec, // This statement will initalize local array memory inside a // kernel. The cpu_tile_mem policy specifies that memory should be // allocated on the stack. The entries in the RAJA::ParamList @@ -293,7 +303,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Use two ForICount statements with seq_exec to call the first lambda. + /// EXERCISE: Use two ForICount statements with seq_exec to call the + first lambda. /// // @@ -308,7 +319,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Use two ForICount statements with seq_exec to call the second lambda. + /// EXERCISE: Use two ForICount statements with seq_exec to call the + second lambda. /// > > @@ -343,65 +355,70 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); - using OPENMP_EXEC_2_POL = - RAJA::KernelPolicy< - // - // (0) Execution policies for outer loops - // These loops iterate over the number of - // tiles needed to carry out the transpose - // - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - // This statement will initalize local array memory inside a - // kernel. The cpu_tile_mem policy specifies that memory should be - // allocated on the stack. The entries in the RAJA::ParamList - // identify RAJA local arrays to intialize in the parameter tuple. - RAJA::statement::InitLocalMem, - // - // (1) Execution policies for the first set of inner - // loops. These loops copy data from the global matrices - // to the local tile. - // - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::omp_parallel_for_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - >, - // - // (2) Execution policies for the second set of inner - // loops. These loops copy data from the local tile to - // the global matrix. - // Note: The order of the loops have been - // swapped! This enables us to swap which - // index has unit stride. 
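// For the two TODOs above: a nested pair of ForICount statements over the
// tile dimensions has the same shape already used in the sequential policy
// earlier in this file (a sketch; the Param slots receive the tile-local
// indices that the lambdas take as tx and ty, and the second set of inner
// loops uses Lambda<1> with the loop order swapped):
//
//   RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec,
//     RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec,
//       RAJA::statement::Lambda<0>
//     >
//   >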
- // - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::Lambda<1> - > - > - > - > - > - >; + using OPENMP_EXEC_2_POL = RAJA::KernelPolicy< + // + // (0) Execution policies for outer loops + // These loops iterate over the number of + // tiles needed to carry out the transpose + // + RAJA::statement::Tile< + 1, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays to intialize in the parameter tuple. + RAJA::statement::InitLocalMem< + RAJA::cpu_tile_mem, + RAJA::ParamList<2>, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. + // + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<1>, + RAJA::omp_parallel_for_exec, + RAJA::statement::ForICount<0, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::Lambda<0>>>, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. + // + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::Lambda<1>>>>>>>; RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Tile_Array(ty, tx) = Aview(row, col); - - }, - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - Atview(col, row) = Tile_Array(ty, tx); + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Tile_Array(ty, tx) = Aview(row, col); + }, - } - ); + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_r, N_c); @@ -413,87 +430,89 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); - using CUDA_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< + using CUDA_EXEC_POL = RAJA::KernelPolicy, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, - // This statement will initalize local array memory inside a - // kernel. The cpu_tile_mem policy specifies that memory should be - // allocated on the stack. The entries in the RAJA::ParamList - // identify RAJA local arrays to intialize in the parameter tuple. - RAJA::statement::InitLocalMem, - // - // (1) Execution policies for the first set of inner - // loops. These loops copy data from the global matrices - // to the local tile. 
- // - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_x_direct, - RAJA::statement::Lambda<0> - > - >, - // Synchronize threads to ensure all loads - // to the local array are complete - RAJA::statement::CudaSyncThreads, - // - // (2) Execution policies for the second set of inner - // loops. These loops copy data from the local tile to - // the global matrix. - // Note: The order of the loops have been - // swapped! This enables us to swap which - // index has unit stride. - // - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct, - RAJA::statement::Lambda<1> - > - >, - // Synchronize threads to ensure all reads - // from the local array are complete - RAJA::statement::CudaSyncThreads - > - > - > - > - >; + RAJA::statement::Tile< + 1, + RAJA::tile_fixed, + RAJA::cuda_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_loop, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays to intialize in the parameter tuple. + RAJA::statement::InitLocalMem< + RAJA::cuda_shared_mem, + RAJA::ParamList<2>, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. + // + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<0>, + RAJA::cuda_thread_y_direct, + RAJA::statement::ForICount<0, + RAJA::statement::Param<1>, + RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<0>>>, + // Synchronize threads to ensure all loads + // to the local array are complete + RAJA::statement::CudaSyncThreads, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. 
+ // + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<1>, + RAJA::cuda_thread_y_direct, + RAJA::statement::ForICount<1, + RAJA::statement::Param<0>, + RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<1>>>, + // Synchronize threads to ensure all reads + // from the local array are complete + RAJA::statement::CudaSyncThreads>>>>>; RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), - - [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Tile_Array(ty, tx) = Aview(row, col); - - }, - - [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - Atview(col, row) = Tile_Array(ty, tx); + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Tile_Array(ty, tx) = Aview(row, col); + }, - } - ); + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - HIP matrix transpose exercise ...\n"; - int *d_A = memoryManager::allocate_gpu(N_r * N_c); - int *d_At = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_At = memoryManager::allocate_gpu(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -505,93 +524,98 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); - using HIP_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< + using HIP_EXEC_POL = RAJA::KernelPolicy, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, - // This statement will initalize local array memory inside a - // kernel. The cpu_tile_mem policy specifies that memory should be - // allocated on the stack. The entries in the RAJA::ParamList - // identify RAJA local arrays to intialize in the parameter tuple. - RAJA::statement::InitLocalMem, - // - // (1) Execution policies for the first set of inner - // loops. These loops copy data from the global matrices - // to the local tile. - // - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_y_direct, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::hip_thread_x_direct, - RAJA::statement::Lambda<0> - > - >, - // Synchronize threads to ensure all loads - // to the local array are complete - RAJA::statement::HipSyncThreads, - // - // (2) Execution policies for the second set of inner - // loops. These loops copy data from the local tile to - // the global matrix. - // Note: The order of the loops have been - // swapped! 
This enables us to swap which - // index has unit stride. - // - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::hip_thread_y_direct, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_x_direct, - RAJA::statement::Lambda<1> - > - >, - // Synchronize threads to ensure all reads - // from the local array are complete - RAJA::statement::HipSyncThreads - > - > - > - > - >; + RAJA::statement::Tile< + 1, + RAJA::tile_fixed, + RAJA::hip_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::hip_block_x_loop, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays to intialize in the parameter tuple. + RAJA::statement::InitLocalMem< + RAJA::hip_shared_mem, + RAJA::ParamList<2>, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. + // + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<0>, + RAJA::hip_thread_y_direct, + RAJA::statement::ForICount<0, + RAJA::statement::Param<1>, + RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<0>>>, + // Synchronize threads to ensure all loads + // to the local array are complete + RAJA::statement::HipSyncThreads, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. + // + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<1>, + RAJA::hip_thread_y_direct, + RAJA::statement::ForICount<1, + RAJA::statement::Param<0>, + RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<1>>>, + // Synchronize threads to ensure all reads + // from the local array are complete + RAJA::statement::HipSyncThreads>>>>>; RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Tile_Array(ty, tx) = d_Aview(row, col); + }, - Tile_Array(ty, tx) = d_Aview(row, col); + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + d_Atview(col, row) = Tile_Array(ty, tx); + }); - }, - - [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - d_Atview(col, row) = Tile_Array(ty, tx); - - } - ); - - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif //--------------------------------------------------------------------------// - std::cout << "\n Running RAJA - sequential matrix transpose exercise with args in statement ...\n"; + std::cout << "\n Running RAJA - sequential matrix transpose exercise with " + "args in statement ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - //Alias for convenience - using RAJA::Segs; + // Alias for convenience using RAJA::Offsets; using RAJA::Params; + using RAJA::Segs; // _mattranspose_localarray_raja_lambdaargs_start /// @@ 
-609,7 +633,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0, Segs<0>, Segs<1>, Offsets<0>, Offsets<1>, Params<0> > + RAJA::statement::Lambda<0, Segs<0>, Segs<1>, Offsets<0>, + Offsets<1>, Params<0> > > >, @@ -624,7 +649,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -644,7 +669,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// return 0; } @@ -657,16 +682,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -678,8 +709,10 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) << std::endl; } diff --git a/exercises/kernel-matrix-transpose-local-array_solution.cpp b/exercises/kernel-matrix-transpose-local-array_solution.cpp index 7b44cd3453..802f07826e 100644 --- a/exercises/kernel-matrix-transpose-local-array_solution.cpp +++ b/exercises/kernel-matrix-transpose-local-array_solution.cpp @@ -66,7 +66,7 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA shared matrix transpose exercise...\n"; @@ -87,8 +87,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -104,8 +104,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } @@ -120,8 +122,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // Stack-allocated local array for data on a tile int Tile[TILE_DIM][TILE_DIM]; @@ -132,14 +136,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that input matrix data 
access // is stride-1. // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[ty][tx] = Aview(row, col); } } @@ -151,19 +158,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1. // - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) + { + for (int ty = 0; ty < TILE_DIM; ++ty) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[ty][tx]; } } } - } } // _mattranspose_localarray_cstyle_end @@ -186,8 +195,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // _mattranspose_localarray_start - using TILE_MEM = - RAJA::LocalArray, RAJA::SizeList>; + using TILE_MEM = RAJA:: + LocalArray, RAJA::SizeList>; TILE_MEM Tile_Array; // _mattranspose_localarray_end @@ -200,43 +209,52 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); // _mattranspose_localarray_raja_start - using SEQ_EXEC_POL_I = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - - RAJA::statement::InitLocalMem, - - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - >, - - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::Lambda<1> - > - > - - > - > - > - >; - - RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - - RAJA::make_tuple((int)0, (int)0, Tile_Array), - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Tile_Array(ty, tx) = Aview(row, col); - }, - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Atview(col, row) = Tile_Array(ty, tx); - } + using SEQ_EXEC_POL_I = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + + RAJA::statement::InitLocalMem< + RAJA::cpu_tile_mem, + RAJA::ParamList<2>, + + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::ForICount<0, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::Lambda<0>>>, + + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::ForICount<1, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::Lambda<1>>> + + >>>>; + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + + RAJA::make_tuple((int)0, (int)0, Tile_Array), + + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { 
+ Tile_Array(ty, tx) = Aview(row, col); + }, + + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + } ); // _mattranspose_localarray_raja_end @@ -252,65 +270,70 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); - using OPENMP_EXEC_1_POL = - RAJA::KernelPolicy< - // - // (0) Execution policies for outer loops - // These loops iterate over the number of - // tiles needed to carry out the transpose - // - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - // This statement will initalize local array memory inside a - // kernel. The cpu_tile_mem policy specifies that memory should be - // allocated on the stack. The entries in the RAJA::ParamList - // identify RAJA local arrays in the parameter tuple to intialize. - RAJA::statement::InitLocalMem, - // - // (1) Execution policies for the first set of inner - // loops. These loops copy data from the global matrices - // to the local tile. - // - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - >, - // - // (2) Execution policies for the second set of inner - // loops. These loops copy data from the local tile to - // the global matrix. - // Note: The order of the loops have been - // swapped! This enables us to swap which - // index has unit stride. - // - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::Lambda<1> - > - > - > - > - > - >; + using OPENMP_EXEC_1_POL = RAJA::KernelPolicy< + // + // (0) Execution policies for outer loops + // These loops iterate over the number of + // tiles needed to carry out the transpose + // + RAJA::statement::Tile< + 1, + RAJA::tile_fixed, + RAJA::omp_parallel_for_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays in the parameter tuple to intialize. + RAJA::statement::InitLocalMem< + RAJA::cpu_tile_mem, + RAJA::ParamList<2>, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. + // + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::ForICount<0, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::Lambda<0>>>, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. 
+ // + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::Lambda<1>>>>>>>; RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Tile_Array(ty, tx) = Aview(row, col); - - }, - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - Atview(col, row) = Tile_Array(ty, tx); + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Tile_Array(ty, tx) = Aview(row, col); + }, - } - ); + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -321,65 +344,70 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); - using OPENMP_EXEC_2_POL = - RAJA::KernelPolicy< - // - // (0) Execution policies for outer loops - // These loops iterate over the number of - // tiles needed to carry out the transpose - // - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - // This statement will initalize local array memory inside a - // kernel. The cpu_tile_mem policy specifies that memory should be - // allocated on the stack. The entries in the RAJA::ParamList - // identify RAJA local arrays to intialize in the parameter tuple. - RAJA::statement::InitLocalMem, - // - // (1) Execution policies for the first set of inner - // loops. These loops copy data from the global matrices - // to the local tile. - // - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::omp_parallel_for_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - >, - // - // (2) Execution policies for the second set of inner - // loops. These loops copy data from the local tile to - // the global matrix. - // Note: The order of the loops have been - // swapped! This enables us to swap which - // index has unit stride. - // - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::Lambda<1> - > - > - > - > - > - >; + using OPENMP_EXEC_2_POL = RAJA::KernelPolicy< + // + // (0) Execution policies for outer loops + // These loops iterate over the number of + // tiles needed to carry out the transpose + // + RAJA::statement::Tile< + 1, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays to intialize in the parameter tuple. + RAJA::statement::InitLocalMem< + RAJA::cpu_tile_mem, + RAJA::ParamList<2>, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. 
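// In each of these kernel_param calls the lambda arguments arrive in a fixed
// order: the segment values first (col from segment 0, row from segment 1),
// then the ForICount counters bound to Param<0> and Param<1> (named tx and ty
// here), and finally the local array bound to ParamList<2>:
//
//   [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { ... }
//
// The final sequential variant at the end of this file instead lists the
// arguments explicitly per lambda with Segs, Offsets, and Params.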
+ // + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<1>, + RAJA::omp_parallel_for_exec, + RAJA::statement::ForICount<0, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::Lambda<0>>>, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. + // + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::Lambda<1>>>>>>>; RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Tile_Array(ty, tx) = Aview(row, col); - - }, + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Atview(col, row) = Tile_Array(ty, tx); + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Tile_Array(ty, tx) = Aview(row, col); + }, - } - ); + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_r, N_c); @@ -391,87 +419,89 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); - using CUDA_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< + using CUDA_EXEC_POL = RAJA::KernelPolicy, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, - // This statement will initalize local array memory inside a - // kernel. The cpu_tile_mem policy specifies that memory should be - // allocated on the stack. The entries in the RAJA::ParamList - // identify RAJA local arrays to intialize in the parameter tuple. - RAJA::statement::InitLocalMem, - // - // (1) Execution policies for the first set of inner - // loops. These loops copy data from the global matrices - // to the local tile. - // - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_x_direct, - RAJA::statement::Lambda<0> - > - >, - // Synchronize threads to ensure all loads - // to the local array are complete - RAJA::statement::CudaSyncThreads, - // - // (2) Execution policies for the second set of inner - // loops. These loops copy data from the local tile to - // the global matrix. - // Note: The order of the loops have been - // swapped! This enables us to swap which - // index has unit stride. - // - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct, - RAJA::statement::Lambda<1> - > - >, - // Synchronize threads to ensure all reads - // from the local array are complete - RAJA::statement::CudaSyncThreads - > - > - > - > - >; + RAJA::statement::Tile< + 1, + RAJA::tile_fixed, + RAJA::cuda_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_loop, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. 
The entries in the RAJA::ParamList + // identify RAJA local arrays to intialize in the parameter tuple. + RAJA::statement::InitLocalMem< + RAJA::cuda_shared_mem, + RAJA::ParamList<2>, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. + // + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<0>, + RAJA::cuda_thread_y_direct, + RAJA::statement::ForICount<0, + RAJA::statement::Param<1>, + RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<0>>>, + // Synchronize threads to ensure all loads + // to the local array are complete + RAJA::statement::CudaSyncThreads, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. + // + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<1>, + RAJA::cuda_thread_y_direct, + RAJA::statement::ForICount<1, + RAJA::statement::Param<0>, + RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<1>>>, + // Synchronize threads to ensure all reads + // from the local array are complete + RAJA::statement::CudaSyncThreads>>>>>; RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), - - [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Tile_Array(ty, tx) = Aview(row, col); - - }, + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Atview(col, row) = Tile_Array(ty, tx); + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Tile_Array(ty, tx) = Aview(row, col); + }, - } - ); + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - HIP matrix transpose exercise ...\n"; - int *d_A = memoryManager::allocate_gpu(N_r * N_c); - int *d_At = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_At = memoryManager::allocate_gpu(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -483,138 +513,154 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); - using HIP_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< + using HIP_EXEC_POL = RAJA::KernelPolicy, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, - // This statement will initalize local array memory inside a - // kernel. 
The cpu_tile_mem policy specifies that memory should be - // allocated on the stack. The entries in the RAJA::ParamList - // identify RAJA local arrays to intialize in the parameter tuple. - RAJA::statement::InitLocalMem, - // - // (1) Execution policies for the first set of inner - // loops. These loops copy data from the global matrices - // to the local tile. - // - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_y_direct, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::hip_thread_x_direct, - RAJA::statement::Lambda<0> - > - >, - // Synchronize threads to ensure all loads - // to the local array are complete - RAJA::statement::HipSyncThreads, - // - // (2) Execution policies for the second set of inner - // loops. These loops copy data from the local tile to - // the global matrix. - // Note: The order of the loops have been - // swapped! This enables us to swap which - // index has unit stride. - // - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::hip_thread_y_direct, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_x_direct, - RAJA::statement::Lambda<1> - > - >, - // Synchronize threads to ensure all reads - // from the local array are complete - RAJA::statement::HipSyncThreads - > - > - > - > - >; + RAJA::statement::Tile< + 1, + RAJA::tile_fixed, + RAJA::hip_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::hip_block_x_loop, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays to intialize in the parameter tuple. + RAJA::statement::InitLocalMem< + RAJA::hip_shared_mem, + RAJA::ParamList<2>, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. + // + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<0>, + RAJA::hip_thread_y_direct, + RAJA::statement::ForICount<0, + RAJA::statement::Param<1>, + RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<0>>>, + // Synchronize threads to ensure all loads + // to the local array are complete + RAJA::statement::HipSyncThreads, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. 
+ // + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<1>, + RAJA::hip_thread_y_direct, + RAJA::statement::ForICount<1, + RAJA::statement::Param<0>, + RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<1>>>, + // Synchronize threads to ensure all reads + // from the local array are complete + RAJA::statement::HipSyncThreads>>>>>; RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), - - [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Tile_Array(ty, tx) = d_Aview(row, col); + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - }, + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Tile_Array(ty, tx) = d_Aview(row, col); + }, - [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - d_Atview(col, row) = Tile_Array(ty, tx); - - } - ); + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + d_Atview(col, row) = Tile_Array(ty, tx); + }); - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif //--------------------------------------------------------------------------// - std::cout << "\n Running RAJA - sequential matrix transpose exercise with args in statement ...\n"; + std::cout << "\n Running RAJA - sequential matrix transpose exercise with " + "args in statement ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - //Alias for convenience - using RAJA::Segs; + // Alias for convenience using RAJA::Offsets; using RAJA::Params; + using RAJA::Segs; // _raja_mattranspose_lambdaargs_start - using SEQ_EXEC_POL_II = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - - RAJA::statement::InitLocalMem, - - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0, Segs<0>, Segs<1>, Offsets<0>, Offsets<1>, Params<0> > - > - >, - - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::Lambda<1, Segs<0, 1>, Offsets<0, 1>, Params<0> > - > - > - - > - > - > - >; - - RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - - RAJA::make_tuple(Tile_Array), - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + using SEQ_EXEC_POL_II = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + + RAJA::statement::InitLocalMem< + RAJA::cpu_tile_mem, + RAJA::ParamList<0>, + + RAJA::statement::For< + 1, + RAJA::seq_exec, + RAJA::statement::For<0, + RAJA::seq_exec, + RAJA::statement::Lambda<0, + Segs<0>, + Segs<1>, + Offsets<0>, + Offsets<1>, + Params<0>>>>, + + RAJA::statement::For< + 0, + RAJA::seq_exec, + RAJA::statement::For< + 1, + RAJA::seq_exec, + RAJA::statement:: + Lambda<1, Segs<0, 1>, Offsets<0, 1>, Params<0>>>> + + >>>>; + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + + RAJA::make_tuple(Tile_Array), + + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { Tile_Array(ty, tx) = Aview(row, col); - }, + }, - [=](int col, int row, int tx, int ty, TILE_MEM 
&Tile_Array) { - Atview(col, row) = Tile_Array(ty, tx); - } - ); + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + }); // _raja_mattranspose_lambdaargs_start checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// return 0; } @@ -627,16 +673,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -648,8 +700,10 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) << std::endl; } diff --git a/exercises/kernel-matrix-transpose-tiled.cpp b/exercises/kernel-matrix-transpose-tiled.cpp index 7316563117..8ab08df5d1 100644 --- a/exercises/kernel-matrix-transpose-tiled.cpp +++ b/exercises/kernel-matrix-transpose-tiled.cpp @@ -21,14 +21,14 @@ * transposed and returned as a second matrix At. * * This operation is carried out using a tiling algorithm. - * The algorithm iterates over tiles of the matrix A and + * The algorithm iterates over tiles of the matrix A and * performs a transpose copy without explicitly storing the tile. * * The algorithm is expressed as a collection of ``outer`` - * and ``inner`` for loops. Iterations of the inner loop will + * and ``inner`` for loops. Iterations of the inner loop will * tranpose tile entries; while outer loops will iterate over * the number of tiles needed to carryout the transpose. - * We do not assume that tiles divide the number of rows and + * We do not assume that tiles divide the number of rows and * and columns of the matrix. 
* * RAJA features shown: @@ -56,7 +56,7 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix transpose exercise...\n"; @@ -77,8 +77,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of tiled matrix transpose, we @@ -94,12 +94,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of tiled matrix transpose...\n"; @@ -110,24 +112,28 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // // (1) Loops to iterate over tile entries // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Aview(row, col); } } } - } } // _cstyle_tiled_mattranspose_end @@ -138,12 +144,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following RAJA variants use the RAJA::kernel method to carryout the - // transpose. + // transpose. // // Here, we define RAJA range segments to establish the iteration spaces. - // Further partioning of the iteration space is carried out in the + // Further partioning of the iteration space is carried out in the // tile_fixed statements. Iterations inside a RAJA loop is given by their - // global iteration number. + // global iteration number. // RAJA::TypedRangeSegment row_Range(0, N_r); RAJA::TypedRangeSegment col_Range(0, N_c); @@ -154,7 +160,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following policy carries out the transpose - // using sequential loops. The template parameter inside + // using sequential loops. The template parameter inside // tile_fixed corresponds to the dimension size of the tile. // // _raja_tiled_mattranspose_start @@ -168,23 +174,23 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// NOTE: We have done this first one for you. 
/// - using TILED_KERNEL_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using TILED_KERNEL_EXEC_POL = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::For< + 1, + RAJA::seq_exec, + RAJA::statement:: + For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>>>; + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); // _raja_tiled_mattranspose_end checkResult(Atview, N_c, N_r); @@ -192,7 +198,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp tiled matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp tiled matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -214,7 +221,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// EXERCISE: Uncomment this code block. /// /* - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + RAJA::kernel( RAJA::make_tuple(col_Range, + row_Range), [=](int col, int row) { Atview(col, row) = Aview(row, col); }); @@ -224,7 +232,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// - std::cout << "\n Running openmp tiled matrix transpose - collapsed inner loops...\n"; + std::cout << "\n Running openmp tiled matrix transpose - collapsed inner " + "loops...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -233,35 +242,37 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // into a single OpenMP parallel for loop enabling parallel loads/reads // to/from the tile. 
// - using TILED_KERNEL_EXEC_POL_OMP2 = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Collapse, - RAJA::statement::Lambda<0> - > //closes collapse - > // closes Tile 0 - > // closes Tile 1 - >; // closes policy list - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using TILED_KERNEL_EXEC_POL_OMP2 = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile<0, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::Collapse< + RAJA::omp_parallel_collapse_exec, + RAJA::ArgList<0, 1>, + RAJA::statement::Lambda<0>> // closes collapse + > // closes Tile 0 + > // closes Tile 1 + >; // closes policy list + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda tiled matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - + // _raja_mattranspose_cuda_start /// @@ -277,7 +288,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// EXERCISE: Uncomment this code block. /// /* - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + RAJA::kernel( RAJA::make_tuple(col_Range, + row_Range), [=] RAJA_DEVICE (int col, int row) { Atview(col, row) = Aview(row, col); }); @@ -285,47 +297,51 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running hip tiled matrix transpose ...\n"; - int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); int* d_At = memoryManager::allocate_gpu(N_r * N_c); RAJA::View> d_Aview(d_A, N_r, N_c); RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); using TILED_KERNEL_EXEC_POL_HIP = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, - RAJA::statement::For<1, RAJA::hip_thread_x_direct, - RAJA::statement::For<0, RAJA::hip_thread_y_direct, - RAJA::statement::Lambda<0> - > - > - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - d_Atview(col, row) = d_Aview(row, col); - }); - - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + RAJA::KernelPolicy, + RAJA::hip_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::hip_block_x_loop, + RAJA::statement::For< + 1, + 
RAJA::hip_thread_x_direct, + RAJA::statement::For<0, + RAJA::hip_thread_y_direct, + RAJA::statement::Lambda<0>>>>>>>; + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE(int col, int row) { + d_Atview(col, row) = d_Aview(row, col); + }); + + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif //----------------------------------------------------------------------------// @@ -340,7 +356,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n DONE!...\n"; return 0; -} +} // // Function to check result and report P/F. @@ -349,16 +365,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -370,11 +392,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix transpose exercise...\n"; @@ -77,8 +77,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of tiled matrix transpose, we @@ -94,12 +94,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of tiled matrix transpose...\n"; @@ -110,24 +112,28 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // // (1) Loops to iterate over tile entries // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { 
+ if (row < N_r && col < N_c) + { Atview(col, row) = Aview(row, col); } } } - } } // _cstyle_tiled_mattranspose_end @@ -138,12 +144,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following RAJA variants use the RAJA::kernel method to carryout the - // transpose. + // transpose. // // Here, we define RAJA range segments to establish the iteration spaces. - // Further partioning of the iteration space is carried out in the + // Further partioning of the iteration space is carried out in the // tile_fixed statements. Iterations inside a RAJA loop is given by their - // global iteration number. + // global iteration number. // RAJA::TypedRangeSegment row_Range(0, N_r); RAJA::TypedRangeSegment col_Range(0, N_c); @@ -154,27 +160,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following policy carries out the transpose - // using sequential loops. The template parameter inside + // using sequential loops. The template parameter inside // tile_fixed corresponds to the dimension size of the tile. // // _raja_tiled_mattranspose_start - using TILED_KERNEL_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using TILED_KERNEL_EXEC_POL = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::For< + 1, + RAJA::seq_exec, + RAJA::statement:: + For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>>>; + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); // _raja_tiled_mattranspose_end checkResult(Atview, N_c, N_r); @@ -182,7 +188,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp tiled matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp tiled matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -190,29 +197,30 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This policy loops over tiles sequentially while exposing parallelism on // one of the inner loops. 
// - using TILED_KERNEL_EXEC_POL_OMP = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::For<1, RAJA::omp_parallel_for_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using TILED_KERNEL_EXEC_POL_OMP = RAJA::KernelPolicy, + RAJA::omp_parallel_for_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::For< + 1, + RAJA::omp_parallel_for_exec, + RAJA::statement:: + For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>>>; + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// - std::cout << "\n Running openmp tiled matrix transpose - collapsed inner loops...\n"; + std::cout << "\n Running openmp tiled matrix transpose - collapsed inner " + "loops...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -221,99 +229,107 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // into a single OpenMP parallel for loop enabling parallel loads/reads // to/from the tile. // - using TILED_KERNEL_EXEC_POL_OMP2 = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Collapse, - RAJA::statement::Lambda<0> - > //closes collapse - > // closes Tile 0 - > // closes Tile 1 - >; // closes policy list - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using TILED_KERNEL_EXEC_POL_OMP2 = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile<0, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::Collapse< + RAJA::omp_parallel_collapse_exec, + RAJA::ArgList<0, 1>, + RAJA::statement::Lambda<0>> // closes collapse + > // closes Tile 0 + > // closes Tile 1 + >; // closes policy list + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda tiled matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - + // _raja_mattranspose_cuda_start - using TILED_KERNEL_EXEC_POL_CUDA = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, - RAJA::statement::For<1, RAJA::cuda_thread_x_direct, - RAJA::statement::For<0, RAJA::cuda_thread_y_direct, - RAJA::statement::Lambda<0> - > - > - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using TILED_KERNEL_EXEC_POL_CUDA = + RAJA::KernelPolicy, + RAJA::cuda_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_loop, + RAJA::statement::For< + 1, + RAJA::cuda_thread_x_direct, + 
RAJA::statement::For<0, + RAJA::cuda_thread_y_direct, + RAJA::statement::Lambda<0>>>>>>>; + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE(int col, int row) { + Atview(col, row) = Aview(row, col); + }); // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running hip tiled matrix transpose ...\n"; - int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); int* d_At = memoryManager::allocate_gpu(N_r * N_c); RAJA::View> d_Aview(d_A, N_r, N_c); RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); using TILED_KERNEL_EXEC_POL_HIP = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, - RAJA::statement::For<1, RAJA::hip_thread_x_direct, - RAJA::statement::For<0, RAJA::hip_thread_y_direct, - RAJA::statement::Lambda<0> - > - > - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - d_Atview(col, row) = d_Aview(row, col); - }); - - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + RAJA::KernelPolicy, + RAJA::hip_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::hip_block_x_loop, + RAJA::statement::For< + 1, + RAJA::hip_thread_x_direct, + RAJA::statement::For<0, + RAJA::hip_thread_y_direct, + RAJA::statement::Lambda<0>>>>>>>; + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE(int col, int row) { + d_Atview(col, row) = d_Aview(row, col); + }); + + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif //----------------------------------------------------------------------------// @@ -328,7 +344,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n DONE!...\n"; return 0; -} +} // // Function to check result and report P/F. 
@@ -337,16 +353,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -358,11 +380,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix transpose exercise...\n"; @@ -60,8 +60,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -77,12 +77,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of matrix transpose...\n"; @@ -90,9 +92,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); // _cstyle_mattranspose_start - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - Atview(col, row) = Aview(row, col); + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + Atview(col, row) = Aview(row, col); } } // _cstyle_mattranspose_end @@ -104,13 +108,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following RAJA variants use the RAJA::kernel method to carryout the - // transpose. + // transpose. // // Here, we define RAJA range segments to establish the iteration spaces. - // Iterations inside a RAJA loop is given by their global iteration number. + // Iterations inside a RAJA loop is given by their global iteration number. // -//RAJA::TypedRangeSegment row_Range(0, N_r); -//RAJA::TypedRangeSegment col_Range(0, N_c); + // RAJA::TypedRangeSegment row_Range(0, N_r); + // RAJA::TypedRangeSegment col_Range(0, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running sequential matrix transpose ...\n"; @@ -118,7 +122,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following policy carries out the transpose - // using sequential loops. + // using sequential loops. 
// // _raja_mattranspose_start @@ -127,9 +131,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// EXERCISE: Implement a sequential RAJA::kernel execution policy for a /// basic matrix transpose. - /// + /// /// Uncomment 'row_Range' and 'col_Range' objects above so they - /// can be used in the kernel. + /// can be used in the kernel. /// /// @@ -149,7 +153,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -163,9 +168,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// EXERCISE: Implement an openmp RAJA::kernel execution policy for a /// basic matrix transpose. - /// + /// /// Uncomment 'row_Range' and 'col_Range' objects above so they - /// can be used in the kernel. + /// can be used in the kernel. /// /// @@ -174,7 +179,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// EXERCISE: Uncomment this code block. /// /* - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=](int col, int row) { Atview(col, row) = Aview(row, col); }); @@ -183,13 +188,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - + // _raja_mattranspose_cuda_start /// @@ -197,9 +202,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// EXERCISE: Implement a CUDA RAJA::kernel execution policy for a /// basic matrix transpose. - /// + /// /// Uncomment 'row_Range' and 'col_Range' objects above so they - /// can be used in the kernel. + /// can be used in the kernel. /// /// @@ -208,7 +213,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// EXERCISE: Uncomment this code block. /// /* - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=] RAJA_DEVICE (int col, int row) { Atview(col, row) = Aview(row, col); }); @@ -216,10 +221,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // // Clean up. @@ -230,7 +235,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n DONE!...\n"; return 0; -} +} // // Function to check result and report P/F. 
@@ -239,16 +244,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -260,11 +271,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix transpose exercise...\n"; @@ -60,8 +60,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -77,12 +77,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of matrix transpose...\n"; @@ -90,9 +92,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); // _cstyle_mattranspose_start - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - Atview(col, row) = Aview(row, col); + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + Atview(col, row) = Aview(row, col); } } // _cstyle_mattranspose_end @@ -104,10 +108,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following RAJA variants use the RAJA::kernel method to carryout the - // transpose. + // transpose. // // Here, we define RAJA range segments to establish the iteration spaces. - // Iterations inside a RAJA loop is given by their global iteration number. + // Iterations inside a RAJA loop is given by their global iteration number. // RAJA::TypedRangeSegment row_Range(0, N_r); RAJA::TypedRangeSegment col_Range(0, N_c); @@ -118,29 +122,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following policy carries out the transpose - // using sequential loops. + // using sequential loops. 
// // _raja_mattranspose_start - using KERNEL_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using KERNEL_EXEC_POL = RAJA::KernelPolicy>>>; + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); // _raja_mattranspose_end checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -148,53 +148,44 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This policy loops sequentially while exposing parallelism on // one of the inner loops. // - using KERNEL_EXEC_POL_OMP = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::omp_parallel_for_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using KERNEL_EXEC_POL_OMP = RAJA::KernelPolicy>>>; + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - + // _raja_mattranspose_cuda_start - using KERNEL_EXEC_POL_CUDA = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::For<1, RAJA::cuda_thread_x_loop, - RAJA::statement::For<0, RAJA::cuda_thread_y_loop, - RAJA::statement::Lambda<0> - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using KERNEL_EXEC_POL_CUDA = + RAJA::KernelPolicy>>>>; + + RAJA::kernel(RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE(int col, int row) { + Atview(col, row) = Aview(row, col); + }); // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // // Clean up. @@ -205,7 +196,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n DONE!...\n"; return 0; -} +} // // Function to check result and report P/F. 
@@ -214,16 +205,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -235,11 +232,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< -__launch_bounds__(i_block_size*j_block_size*k_block_size) -__global__ void nested_init(double* a, double c, int N) +template +__launch_bounds__(i_block_size* j_block_size* k_block_size) __global__ + void nested_init(double* a, double c, int N) { int i = blockIdx.x * i_block_size + threadIdx.x; int j = blockIdx.y * j_block_size + threadIdx.y; int k = blockIdx.z; - if ( i < N && j < N && k < N ) { - a[i+N*(j+N*k)] = c * i * j * k ; + if (i < N && j < N && k < N) + { + a[i + N * (j + N * k)] = c * i * j * k; } } // _cuda_tensorinit_kernel_end @@ -58,64 +59,71 @@ __global__ void nested_init(double* a, double c, int N) void checkResult(double* a, double* aref, const int n); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; -// _init_define_start -// -// 3D tensor has N^3 entries -// + // _init_define_start + // + // 3D tensor has N^3 entries + // constexpr int N = 100; constexpr int N_tot = N * N * N; constexpr double c = 0.0001; double* a = memoryManager::allocate(N_tot); double* a_ref = memoryManager::allocate(N_tot); -// _init_define_end - -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; - -// _cstyle_tensorinit_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - a_ref[i+N*(j+N*k)] = c * i * j * k ; + // _init_define_end + + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference " + "solution ...\n"; + + // _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + a_ref[i + N * (j + N * k)] = c * i * j * k; } } } -// _cstyle_tensorinit_seq_end + // _cstyle_tensorinit_seq_end -//----------------------------------------------------------------------------// -// We introduce a RAJA View to wrap the tensor data pointer and simplify -// multi-dimensional indexing. -// We use this in the rest of the examples in this file. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // We introduce a RAJA View to wrap the tensor data pointer and simplify + // multi-dimensional indexing. + // We use this in the rest of the examples in this file. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential tensor init...\n"; -// _3D_raja_view_start - RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); -// _3D_raja_view_end - -// _cstyle_tensorinit_view_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; + // _3D_raja_view_start + RAJA::View> aView(a, N, N, N); + // _3D_raja_view_end + + // _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_view_seq_end + // _cstyle_tensorinit_view_seq_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential tensor init...\n"; @@ -135,110 +143,105 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// C-style and RAJA OpenMP multithreading variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA OpenMP multithreading variants. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - // _cstyle_tensorinit_omp_outer_start - #pragma omp parallel for - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; +// _cstyle_tensorinit_omp_outer_start +#pragma omp parallel for + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_omp_outer_end + // _cstyle_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_omp_outer_start - using EXEC_POL2 = - RAJA::KernelPolicy< - RAJA::statement::For<2, RAJA::omp_parallel_for_exec, // k - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec, // i - RAJA::statement::Lambda<0> - > - > - > - >; - - RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), - - [=]( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_omp_outer_end + // _raja_tensorinit_omp_outer_start + using EXEC_POL2 = RAJA::KernelPolicy>>>>; + + RAJA::kernel( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), + + [=](int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-style OpenMP collapse (3) tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - // _cstyle_tensorinit_omp_collapse_start - #pragma omp parallel for collapse(3) - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; +// _cstyle_tensorinit_omp_collapse_start +#pragma omp parallel for collapse(3) + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_omp_collapse_end + // _cstyle_tensorinit_omp_collapse_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP collapse(3) tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_omp_collapse_start - using EXEC_POL3 = - RAJA::KernelPolicy< + // _raja_tensorinit_omp_collapse_start + using EXEC_POL3 = RAJA::KernelPolicy< RAJA::statement::Collapse, // k, j, i - RAJA::statement::Lambda<0> - > - >; - - RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), - - [=]( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_omp_collapse_end + RAJA::ArgList<2, 1, 0>, // k, j, i + RAJA::statement::Lambda<0>>>; + + RAJA::kernel( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), + + [=](int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_omp_collapse_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP collapse(2) tensor init...\n"; @@ -262,43 +265,38 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// C-style and RAJA CUDA GPU variants. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA CUDA GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_cuda_start + // _raja_tensorinit_cuda_start using EXEC_POL5 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::For<2, RAJA::cuda_thread_z_loop, // k - RAJA::statement::For<1, RAJA::cuda_thread_y_loop, // j - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // i - RAJA::statement::Lambda<0> - > - > - > - > - >; + RAJA::KernelPolicy>>>>>; RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), - [=] __device__ ( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_cuda_end + [=] __device__(int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_cuda_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; @@ -308,56 +306,54 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Define total thread-block size and size of each block dimension // -// _cuda_blockdim_start + // _cuda_blockdim_start constexpr int block_size = 256; constexpr int i_block_sz = 32; constexpr int j_block_sz = block_size / i_block_sz; constexpr int k_block_sz = 1; -// _cuda_blockdim_end - -// _raja_tensorinit_cuda_tiled_direct_start - using EXEC_POL6 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernelFixed< i_block_sz * j_block_sz * k_block_sz, - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::cuda_block_y_direct, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::cuda_block_x_direct, - RAJA::statement::For<2, RAJA::cuda_block_z_direct, // k - RAJA::statement::For<1, RAJA::cuda_thread_y_direct, // j - RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // i - RAJA::statement::Lambda<0> - > - > - > - > - > - > - >; + // _cuda_blockdim_end + + // _raja_tensorinit_cuda_tiled_direct_start + using EXEC_POL6 = RAJA::KernelPolicy, + RAJA::cuda_block_y_direct, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_direct, + RAJA::statement::For< + 2, + RAJA::cuda_block_z_direct, // k + RAJA::statement::For< + 1, + RAJA::cuda_thread_y_direct, // j + RAJA::statement::For<0, + RAJA::cuda_thread_x_direct, // i + RAJA::statement::Lambda<0>>>>>>>>; RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), - [=] __device__ ( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_cuda_tiled_direct_end + [=] __device__(int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_cuda_tiled_direct_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// 
+ //----------------------------------------------------------------------------// std::cout << "\n Running CUDA tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _cuda_tensorinit_tiled_direct_start + // _cuda_tensorinit_tiled_direct_start dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); - static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + static_assert(i_block_sz * j_block_sz * k_block_sz == block_size, "Invalid block_size"); dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), @@ -365,10 +361,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); nested_init - <<>>(a, c, N); - cudaErrchk( cudaGetLastError() ); + <<>>(a, c, N); + cudaErrchk(cudaGetLastError()); cudaErrchk(cudaDeviceSynchronize()); -// _cuda_tensorinit_tiled_direct_end + // _cuda_tensorinit_tiled_direct_end checkResult(a, a_ref, N_tot); @@ -377,51 +373,47 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// RAJA HIP GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA HIP GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - double *d_a = memoryManager::allocate_gpu(N_tot); + double* d_a = memoryManager::allocate_gpu(N_tot); -// _3D_raja_device_view_start - RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); -// _3D_raja_device_view_end + // _3D_raja_device_view_start + RAJA::View> d_aView(d_a, N, N, N); + // _3D_raja_device_view_end - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_start + // _raja_tensorinit_hip_start using EXEC_POL7 = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::For<2, RAJA::hip_thread_z_loop, // k - RAJA::statement::For<1, RAJA::hip_thread_y_loop, // j - RAJA::statement::For<0, RAJA::hip_thread_x_loop, // i - RAJA::statement::Lambda<0> - > - > - > - > - >; - - RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), - - [=] __device__ ( int i, int j, int k) { - d_aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_hip_end - - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + RAJA::KernelPolicy>>>>>; + + RAJA::kernel(RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), + + [=] __device__(int i, int j, int k) { + d_aView(i, j, k) = c * i * j * k; + }); + // _raja_tensorinit_hip_end + + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; @@ -435,47 +427,46 @@ int main(int RAJA_UNUSED_ARG(argc), char 
**RAJA_UNUSED_ARG(argv[])) // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); - -// _raja_tensorinit_hip_tiled_direct_start - using EXEC_POL8 = - RAJA::KernelPolicy< - RAJA::statement::HipKernelFixed< i_block_sz * j_block_sz * k_block_sz, - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::hip_block_y_direct, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::hip_block_x_direct, - RAJA::statement::For<2, RAJA::hip_block_z_direct, // k - RAJA::statement::For<1, RAJA::hip_thread_y_direct, // j - RAJA::statement::For<0, RAJA::hip_thread_x_direct, // i - RAJA::statement::Lambda<0> - > - > - > - > - > - > - >; - - RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), - - [=] __device__ ( int i, int j, int k) { - d_aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_hip_tiled_direct_end - - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); + + // _raja_tensorinit_hip_tiled_direct_start + using EXEC_POL8 = RAJA::KernelPolicy, + RAJA::hip_block_y_direct, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::hip_block_x_direct, + RAJA::statement::For< + 2, + RAJA::hip_block_z_direct, // k + RAJA::statement::For< + 1, + RAJA::hip_thread_y_direct, // j + RAJA::statement::For<0, + RAJA::hip_thread_x_direct, // i + RAJA::statement::Lambda<0>>>>>>>>; + + RAJA::kernel(RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), + + [=] __device__(int i, int j, int k) { + d_aView(i, j, k) = c * i * j * k; + }); + // _raja_tensorinit_hip_tiled_direct_end + + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); memoryManager::deallocate_gpu(d_a); #endif // if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // Clean up... 
memoryManager::deallocate(a); @@ -494,14 +485,18 @@ void checkResult(double* a, double* aref, const int n) bool correct = true; int i = 0; - while ( correct && (i < n) ) { + while (correct && (i < n)) + { correct = std::abs(a[i] - aref[i]) < 10e-12; i++; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/kernelintro-execpols_solution.cpp b/exercises/kernelintro-execpols_solution.cpp index c5041e01a9..9fecb8bfe9 100644 --- a/exercises/kernelintro-execpols_solution.cpp +++ b/exercises/kernelintro-execpols_solution.cpp @@ -37,16 +37,17 @@ #if defined(RAJA_ENABLE_CUDA) // _cuda_tensorinit_kernel_start -template< int i_block_size, int j_block_size, int k_block_size > -__launch_bounds__(i_block_size*j_block_size*k_block_size) -__global__ void nested_init(double* a, double c, int N) +template +__launch_bounds__(i_block_size* j_block_size* k_block_size) __global__ + void nested_init(double* a, double c, int N) { int i = blockIdx.x * i_block_size + threadIdx.x; int j = blockIdx.y * j_block_size + threadIdx.y; int k = blockIdx.z; - if ( i < N && j < N && k < N ) { - a[i+N*(j+N*k)] = c * i * j * k ; + if (i < N && j < N && k < N) + { + a[i + N * (j + N * k)] = c * i * j * k; } } // _cuda_tensorinit_kernel_end @@ -58,228 +59,219 @@ __global__ void nested_init(double* a, double c, int N) void checkResult(double* a, double* aref, const int n); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; -// _init_define_start -// -// 3D tensor has N^3 entries -// + // _init_define_start + // + // 3D tensor has N^3 entries + // constexpr int N = 100; constexpr int N_tot = N * N * N; constexpr double c = 0.0001; double* a = memoryManager::allocate(N_tot); double* a_ref = memoryManager::allocate(N_tot); -// _init_define_end - -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; - -// _cstyle_tensorinit_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - a_ref[i+N*(j+N*k)] = c * i * j * k ; + // _init_define_end + + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference " + "solution ...\n"; + + // _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + a_ref[i + N * (j + N * k)] = c * i * j * k; } } } -// _cstyle_tensorinit_seq_end + // _cstyle_tensorinit_seq_end -//----------------------------------------------------------------------------// -// We introduce a RAJA View to wrap the tensor data pointer and simplify -// multi-dimensional indexing. -// We use this in the rest of the examples in this file. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // We introduce a RAJA View to wrap the tensor data pointer and simplify + // multi-dimensional indexing. + // We use this in the rest of the examples in this file. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential tensor init...\n"; -// _3D_raja_view_start - RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); -// _3D_raja_view_end - -// _cstyle_tensorinit_view_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; + // _3D_raja_view_start + RAJA::View> aView(a, N, N, N); + // _3D_raja_view_end + + // _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_view_seq_end + // _cstyle_tensorinit_view_seq_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_seq_start - using EXEC_POL1 = - RAJA::KernelPolicy< - RAJA::statement::For<2, RAJA::seq_exec, // k - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec,// i - RAJA::statement::Lambda<0> - > - > - > - >; + // _raja_tensorinit_seq_start + using EXEC_POL1 = RAJA::KernelPolicy>>>>; RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), - [=]( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_seq_end + [=](int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_seq_end checkResult(a, a_ref, N_tot); #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// C-style and RAJA OpenMP multithreading variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA OpenMP multithreading variants. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - // _cstyle_tensorinit_omp_outer_start - #pragma omp parallel for - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; +// _cstyle_tensorinit_omp_outer_start +#pragma omp parallel for + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_omp_outer_end + // _cstyle_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_omp_outer_start - using EXEC_POL2 = - RAJA::KernelPolicy< - RAJA::statement::For<2, RAJA::omp_parallel_for_exec, // k - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec, // i - RAJA::statement::Lambda<0> - > - > - > - >; + // _raja_tensorinit_omp_outer_start + using EXEC_POL2 = RAJA::KernelPolicy>>>>; RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), - [=]( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_omp_outer_end + [=](int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-style OpenMP collapse (3) tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - // _cstyle_tensorinit_omp_collapse_start - #pragma omp parallel for collapse(3) - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; +// _cstyle_tensorinit_omp_collapse_start +#pragma omp parallel for collapse(3) + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_omp_collapse_end + // _cstyle_tensorinit_omp_collapse_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP collapse(3) tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_omp_collapse_start - using EXEC_POL3 = - RAJA::KernelPolicy< + // _raja_tensorinit_omp_collapse_start + using EXEC_POL3 = RAJA::KernelPolicy< RAJA::statement::Collapse, // k, j, i - RAJA::statement::Lambda<0> - > - >; + RAJA::ArgList<2, 1, 0>, // k, j, i + RAJA::statement::Lambda<0>>>; RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), - [=]( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_omp_collapse_end + [=](int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_omp_collapse_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP collapse(2) tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_omp_collapse_start - using EXEC_POL4 = - RAJA::KernelPolicy< - RAJA::statement::Collapse, // k, j - RAJA::statement::For<0, RAJA::seq_exec, // i - RAJA::statement::Lambda<0> - > - > - >; + // _raja_tensorinit_omp_collapse_start + using EXEC_POL4 = RAJA::KernelPolicy, // k, j + RAJA::statement::For<0, + RAJA::seq_exec, // i + RAJA::statement::Lambda<0>>>>; RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), - [=]( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_omp_collapse_end + [=](int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_omp_collapse_end checkResult(a, a_ref, N_tot); @@ -288,43 +280,38 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// C-style and RAJA CUDA GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA CUDA GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_cuda_start + // _raja_tensorinit_cuda_start using EXEC_POL5 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::For<2, RAJA::cuda_thread_z_loop, // k - RAJA::statement::For<1, RAJA::cuda_thread_y_loop, // j - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // i - RAJA::statement::Lambda<0> - > - > - > - > - >; + RAJA::KernelPolicy>>>>>; RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), - [=] __device__ ( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_cuda_end + [=] __device__(int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_cuda_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; @@ -334,56 +321,54 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Define total thread-block size and size of each block dimension // -// _cuda_blockdim_start + // _cuda_blockdim_start constexpr int block_size = 256; constexpr int i_block_sz = 32; constexpr int j_block_sz = block_size / i_block_sz; constexpr int k_block_sz = 1; -// _cuda_blockdim_end - -// _raja_tensorinit_cuda_tiled_direct_start - using EXEC_POL6 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernelFixed< i_block_sz * j_block_sz * k_block_sz, - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::cuda_block_y_direct, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::cuda_block_x_direct, - RAJA::statement::For<2, RAJA::cuda_block_z_direct, // k - RAJA::statement::For<1, RAJA::cuda_thread_y_direct, // j - RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // i - RAJA::statement::Lambda<0> - > - > - > - > - > - > - >; + // _cuda_blockdim_end + + // _raja_tensorinit_cuda_tiled_direct_start + using EXEC_POL6 = RAJA::KernelPolicy, + RAJA::cuda_block_y_direct, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_direct, + RAJA::statement::For< + 2, + RAJA::cuda_block_z_direct, // k + RAJA::statement::For< + 1, + RAJA::cuda_thread_y_direct, // j + RAJA::statement::For<0, + RAJA::cuda_thread_x_direct, // i + RAJA::statement::Lambda<0>>>>>>>>; RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), - [=] __device__ ( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_cuda_tiled_direct_end + [=] __device__(int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_cuda_tiled_direct_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); -// _cuda_tensorinit_tiled_direct_start + // _cuda_tensorinit_tiled_direct_start dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); - static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + static_assert(i_block_sz * j_block_sz * k_block_sz == block_size, "Invalid block_size"); dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), @@ -391,10 +376,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); nested_init - <<>>(a, c, N); - cudaErrchk( cudaGetLastError() ); + <<>>(a, c, N); + cudaErrchk(cudaGetLastError()); cudaErrchk(cudaDeviceSynchronize()); -// _cuda_tensorinit_tiled_direct_end + // _cuda_tensorinit_tiled_direct_end checkResult(a, a_ref, N_tot); @@ -403,51 +388,47 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// RAJA HIP GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA HIP GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - double *d_a = memoryManager::allocate_gpu(N_tot); + double* d_a = memoryManager::allocate_gpu(N_tot); -// _3D_raja_device_view_start - RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); -// _3D_raja_device_view_end + // _3D_raja_device_view_start + RAJA::View> d_aView(d_a, N, N, N); + // _3D_raja_device_view_end - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_start + // _raja_tensorinit_hip_start using EXEC_POL7 = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::For<2, RAJA::hip_thread_z_loop, // k - RAJA::statement::For<1, RAJA::hip_thread_y_loop, // j - RAJA::statement::For<0, RAJA::hip_thread_x_loop, // i - RAJA::statement::Lambda<0> - > - > - > - > - >; - - RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), - - [=] __device__ ( int i, int j, int k) { - d_aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_hip_end - - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + RAJA::KernelPolicy>>>>>; + + RAJA::kernel(RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), + + [=] __device__(int i, int j, int k) { + d_aView(i, j, k) = c * i * j * k; + }); + // _raja_tensorinit_hip_end + + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; @@ -461,47 +442,46 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); - -// _raja_tensorinit_hip_tiled_direct_start - using EXEC_POL8 = - RAJA::KernelPolicy< - RAJA::statement::HipKernelFixed< i_block_sz * j_block_sz * k_block_sz, - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::hip_block_y_direct, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::hip_block_x_direct, - RAJA::statement::For<2, RAJA::hip_block_z_direct, // k - RAJA::statement::For<1, RAJA::hip_thread_y_direct, // j - RAJA::statement::For<0, RAJA::hip_thread_x_direct, // i - RAJA::statement::Lambda<0> - > - > - > - > - > - > - >; - - RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), - - [=] __device__ ( int i, int j, int k) { - d_aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_hip_tiled_direct_end - - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); + + // _raja_tensorinit_hip_tiled_direct_start + using EXEC_POL8 = RAJA::KernelPolicy, + RAJA::hip_block_y_direct, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::hip_block_x_direct, + RAJA::statement::For< + 2, + RAJA::hip_block_z_direct, // k + RAJA::statement::For< + 1, + RAJA::hip_thread_y_direct, // j + RAJA::statement::For<0, + RAJA::hip_thread_x_direct, // i + RAJA::statement::Lambda<0>>>>>>>>; + + RAJA::kernel(RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), + + [=] __device__(int i, int j, int k) { + d_aView(i, j, k) = c * i * j * k; + }); + // _raja_tensorinit_hip_tiled_direct_end + + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); memoryManager::deallocate_gpu(d_a); #endif // if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // Clean up... memoryManager::deallocate(a); @@ -520,14 +500,18 @@ void checkResult(double* a, double* aref, const int n) bool correct = true; int i = 0; - while ( correct && (i < n) ) { + while (correct && (i < n)) + { correct = std::abs(a[i] - aref[i]) < 10e-12; i++; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/kernelintro-nested-loop-reorder.cpp b/exercises/kernelintro-nested-loop-reorder.cpp index 406ea7e581..18d6bc5e3f 100644 --- a/exercises/kernelintro-nested-loop-reorder.cpp +++ b/exercises/kernelintro-nested-loop-reorder.cpp @@ -14,10 +14,10 @@ * Nested Loop Basics and Loop Reordering (RAJA::kernel) * * In this exercise, we introduce basic RAJA::kernel mechanics for executing - * nested loop kernels, including using execution policies to permute the - * order of loops in a loop nest. The exercise performs no actual + * nested loop kernels, including using execution policies to permute the + * order of loops in a loop nest. The exercise performs no actual * computation and just prints out loop indices to show different - * loop ordering. Also, to avoid difficulty in interpreting parallel + * loop ordering. Also, to avoid difficulty in interpreting parallel * output, the execution policies use sequential execution. 
* * RAJA features shown: @@ -28,18 +28,18 @@ // // Define three named loop index integer types used in the triply-nested loops. -// These will trigger compilation errors if lambda index argument ordering +// These will trigger compilation errors if lambda index argument ordering // and types do not match the typed range index ordering. See final // example in this file. // // _raja_typed_indices_start RAJA_INDEX_VALUE_T(KIDX, int, "KIDX"); -RAJA_INDEX_VALUE_T(JIDX, int, "JIDX"); -RAJA_INDEX_VALUE_T(IIDX, int, "IIDX"); +RAJA_INDEX_VALUE_T(JIDX, int, "JIDX"); +RAJA_INDEX_VALUE_T(IIDX, int, "IIDX"); // _raja_typed_indices_end -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { // _range_min_max_start @@ -51,117 +51,141 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int kmax = 4; // _range_min_max_end -// -// The RAJA variants of the loop nest use the following typed range segments -// based on the typed indices defined above, outside of main(). -// + // + // The RAJA variants of the loop nest use the following typed range segments + // based on the typed indices defined above, outside of main(). + // // _raja_typed_index_ranges_start RAJA::TypedRangeSegment KRange(kmin, kmax); RAJA::TypedRangeSegment JRange(jmin, jmax); RAJA::TypedRangeSegment IRange(imin, imax); // _raja_typed_index_ranges_end - + std::cout << "\n\nRAJA::kernel nested loop reorder example...\n"; -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::cout << "\n Running C-style nested loop order: K-outer, J-middle, I-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + std::cout << "\n Running C-style nested loop order: K-outer, J-middle, " + "I-inner" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _cstyle_kji_loops_start - for (int k = kmin; k < kmax; ++k) { - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { - printf( " (%d, %d, %d) \n", i, j, k); + for (int k = kmin; k < kmax; ++k) + { + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { + printf(" (%d, %d, %d) \n", i, j, k); } } } // _cstyle_kji_loops_end -//----------------------------------------------------------------------------// - - std::cout << "\n\n Running RAJA nested loop order (K-outer, J-middle, I-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + //----------------------------------------------------------------------------// + + std::cout << "\n\n Running RAJA nested loop order (K-outer, J-middle, " + "I-inner)" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _raja_kji_loops_start - using KJI_EXECPOL = RAJA::KernelPolicy< - RAJA::statement::For<2, RAJA::seq_exec, // k - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec,// i - RAJA::statement::Lambda<0> - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); + using KJI_EXECPOL = RAJA::KernelPolicy>>>>; + + RAJA::kernel( + RAJA::make_tuple(IRange, JRange, KRange), [=](IIDX i, JIDX j, KIDX k) { + printf(" (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); + }); // 
_raja_kji_loops_end -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style nested loop order: J-outer, I-middle, K-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style nested loop order: J-outer, I-middle, " + "K-inner" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _cstyle_jik_loops_start - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { - for (int k = kmin; k < kmax; ++k) { - printf( " (%d, %d, %d) \n", i, j, k); + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { + for (int k = kmin; k < kmax; ++k) + { + printf(" (%d, %d, %d) \n", i, j, k); } } } // _cstyle_jik_loops_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA nested loop order (J-outer, I-middle, K-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; /// /// TODO... /// - /// EXERCISE: Make a RAJA version of the kernel with j on outer loop, + /// EXERCISE: Make a RAJA version of the kernel with j on outer loop, /// i on middle loop, and k on inner loop /// -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style nested loop order: I-outer, K-middle, J-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style nested loop order: I-outer, K-middle, " + "J-inner" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _cstyle_ikj_loops_start - for (int i = imin; i < imax; ++i) { - for (int k = kmin; k < kmax; ++k) { - for (int j = jmin; j < jmax; ++j) { - printf( " (%d, %d, %d) \n", i, j, k); + for (int i = imin; i < imax; ++i) + { + for (int k = kmin; k < kmax; ++k) + { + for (int j = jmin; j < jmax; ++j) + { + printf(" (%d, %d, %d) \n", i, j, k); } } } // _cstyle_ikj_loops_end -//----------------------------------------------------------------------------// - + //----------------------------------------------------------------------------// + std::cout << "\n Running RAJA nested loop order (I-outer, K-middle, J-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; /// /// TODO... /// - /// EXERCISE: Make a RAJA version of the kernel with i on outer loop, + /// EXERCISE: Make a RAJA version of the kernel with i on outer loop, /// k on middle loop, and j on inner loop /// -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// - -#if 0 // Enable this code block to generate compiler error. 
+ //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + +#if 0 // Enable this code block to generate compiler error. //----------------------------------------------------------------------------// // The following demonstrates that code will not compile if lambda argument // types/order do not match the types/order For statements in the execution @@ -181,4 +205,3 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) return 0; } - diff --git a/exercises/kernelintro-nested-loop-reorder_solution.cpp b/exercises/kernelintro-nested-loop-reorder_solution.cpp index 9df3ff4657..de28c08e67 100644 --- a/exercises/kernelintro-nested-loop-reorder_solution.cpp +++ b/exercises/kernelintro-nested-loop-reorder_solution.cpp @@ -14,10 +14,10 @@ * Nested Loop Basics and Loop Reordering (RAJA::kernel) * * In this exercise, we introduce basic RAJA::kernel mechanics for executing - * nested loop kernels, including using execution policies to permute the - * order of loops in a loop nest. The exercise performs no actual + * nested loop kernels, including using execution policies to permute the + * order of loops in a loop nest. The exercise performs no actual * computation and just prints out loop indices to show different - * loop ordering. Also, to avoid difficulty in interpreting parallel + * loop ordering. Also, to avoid difficulty in interpreting parallel * output, the execution policies use sequential execution. * * RAJA features shown: @@ -28,18 +28,18 @@ // // Define three named loop index integer types used in the triply-nested loops. -// These will trigger compilation errors if lambda index argument ordering +// These will trigger compilation errors if lambda index argument ordering // and types do not match the typed range index ordering. See final // example in this file. // // _raja_typed_indices_start RAJA_INDEX_VALUE_T(KIDX, int, "KIDX"); -RAJA_INDEX_VALUE_T(JIDX, int, "JIDX"); -RAJA_INDEX_VALUE_T(IIDX, int, "IIDX"); +RAJA_INDEX_VALUE_T(JIDX, int, "JIDX"); +RAJA_INDEX_VALUE_T(IIDX, int, "IIDX"); // _raja_typed_indices_end -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { // _range_min_max_start @@ -51,137 +51,159 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int kmax = 4; // _range_min_max_end -// -// The RAJA variants of the loop nest use the following typed range segments -// based on the typed indices defined above, outside of main(). -// + // + // The RAJA variants of the loop nest use the following typed range segments + // based on the typed indices defined above, outside of main(). 
+ // // _raja_typed_index_ranges_start RAJA::TypedRangeSegment KRange(kmin, kmax); RAJA::TypedRangeSegment JRange(jmin, jmax); RAJA::TypedRangeSegment IRange(imin, imax); // _raja_typed_index_ranges_end - + std::cout << "\n\nRAJA::kernel nested loop reorder example...\n"; -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::cout << "\n Running C-style nested loop order: K-outer, J-middle, I-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + std::cout << "\n Running C-style nested loop order: K-outer, J-middle, " + "I-inner" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _cstyle_kji_loops_start - for (int k = kmin; k < kmax; ++k) { - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { - printf( " (%d, %d, %d) \n", i, j, k); + for (int k = kmin; k < kmax; ++k) + { + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { + printf(" (%d, %d, %d) \n", i, j, k); } } } // _cstyle_kji_loops_end -//----------------------------------------------------------------------------// - - std::cout << "\n\n Running RAJA nested loop order (K-outer, J-middle, I-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + //----------------------------------------------------------------------------// + + std::cout << "\n\n Running RAJA nested loop order (K-outer, J-middle, " + "I-inner)" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _raja_kji_loops_start - using KJI_EXECPOL = RAJA::KernelPolicy< - RAJA::statement::For<2, RAJA::seq_exec, // k - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec,// i - RAJA::statement::Lambda<0> - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); + using KJI_EXECPOL = RAJA::KernelPolicy>>>>; + + RAJA::kernel( + RAJA::make_tuple(IRange, JRange, KRange), [=](IIDX i, JIDX j, KIDX k) { + printf(" (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); + }); // _raja_kji_loops_end -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style nested loop order: J-outer, I-middle, K-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style nested loop order: J-outer, I-middle, " + "K-inner" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _cstyle_jik_loops_start - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { - for (int k = kmin; k < kmax; ++k) { - printf( " (%d, %d, %d) \n", i, j, k); + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { + for (int k = kmin; k < kmax; ++k) + { + printf(" (%d, %d, %d) \n", i, j, k); } } } // _cstyle_jik_loops_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA nested loop order (J-outer, 
I-middle, K-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _raja_jik_loops_start - using JIK_EXECPOL = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec, // i - RAJA::statement::For<2, RAJA::seq_exec,// k - RAJA::statement::Lambda<0> - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); + using JIK_EXECPOL = RAJA::KernelPolicy>>>>; + + RAJA::kernel( + RAJA::make_tuple(IRange, JRange, KRange), [=](IIDX i, JIDX j, KIDX k) { + printf(" (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); + }); // _raja_jik_loops_end -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style nested loop order: I-outer, K-middle, J-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style nested loop order: I-outer, K-middle, " + "J-inner" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _cstyle_ikj_loops_start - for (int i = imin; i < imax; ++i) { - for (int k = kmin; k < kmax; ++k) { - for (int j = jmin; j < jmax; ++j) { - printf( " (%d, %d, %d) \n", i, j, k); + for (int i = imin; i < imax; ++i) + { + for (int k = kmin; k < kmax; ++k) + { + for (int j = jmin; j < jmax; ++j) + { + printf(" (%d, %d, %d) \n", i, j, k); } } } // _cstyle_ikj_loops_end -//----------------------------------------------------------------------------// - + //----------------------------------------------------------------------------// + std::cout << "\n Running RAJA nested loop order (I-outer, K-middle, J-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _raja_ikj_loops_start - using IKJ_EXECPOL = RAJA::KernelPolicy< - RAJA::statement::For<0, RAJA::seq_exec, // i - RAJA::statement::For<2, RAJA::seq_exec, // k - RAJA::statement::For<1, RAJA::seq_exec,// j - RAJA::statement::Lambda<0> - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); + using IKJ_EXECPOL = RAJA::KernelPolicy>>>>; + + RAJA::kernel( + RAJA::make_tuple(IRange, JRange, KRange), [=](IIDX i, JIDX j, KIDX k) { + printf(" (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); + }); // _raja_ikj_loops_end -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// - -#if 0 // Enable this code block to generate compiler error. + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + +#if 0 // Enable this code block to generate compiler error. 
//----------------------------------------------------------------------------// // The following demonstrates that code will not compile if lambda argument // types/order do not match the types/order For statements in the execution @@ -201,4 +223,3 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) return 0; } - diff --git a/exercises/launch-matrix-transpose-local-array.cpp b/exercises/launch-matrix-transpose-local-array.cpp index eea48d073a..8bcab8dc1e 100644 --- a/exercises/launch-matrix-transpose-local-array.cpp +++ b/exercises/launch-matrix-transpose-local-array.cpp @@ -50,7 +50,7 @@ // Define dimensionality of matrices and tile size // const int DIM = 2; -#define TILE_DIM (16) // #define to appease msvc +#define TILE_DIM (16) // #define to appease msvc // // Function for checking results @@ -65,7 +65,7 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA shared matrix transpose example...\n"; @@ -84,8 +84,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -101,8 +101,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } @@ -117,8 +119,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // Stack-allocated local array for data on a tile int Tile[TILE_DIM][TILE_DIM]; @@ -129,14 +133,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that input matrix data access // is stride-1. // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[ty][tx] = Aview(row, col); } } @@ -148,19 +155,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1. 
// - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) + { + for (int ty = 0; ty < TILE_DIM; ++ty) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[ty][tx]; } } } - } } // _mattranspose_localarray_cstyle_end @@ -179,34 +188,38 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using launch_policy_1 = RAJA::LaunchPolicy; RAJA::launch( - RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - - /// - /// TODO ... - /// - /// Exercise Implement loop_icount methods to load tiles of the - /// input matrix into the RAJA_TEAM_SHARED memory array - /// - - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - - Atview(col, row) = Tile_Array[ty][tx]; - + RAJA::LaunchParams(), // LaunchParams may be empty when only running on + // the cpu + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_r), + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_c), + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + /// + /// TODO ... + /// + /// Exercise Implement loop_icount methods to load tiles of + /// the input matrix into the RAJA_TEAM_SHARED memory array + /// + + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + Atview(col, row) = Tile_Array[ty][tx]; + }); + }); + }); }); - }); - - }); }); - - }); // _mattranspose_localarray_raja_end checkResult(Atview, N_c, N_r); @@ -231,39 +244,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// within the omp parallel region. 
/// - //using loop_pol_2 = RAJA::LoopPolicy; + // using loop_pol_2 = RAJA::LoopPolicy; using launch_policy_2 = RAJA::LaunchPolicy; RAJA::launch( - RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { + RAJA::LaunchParams(), // LaunchParams may be empty when only running on + // the cpu + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext /*ctx*/) { + /* + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, + N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, + RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment + const &col_tile) { - /* - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) + { RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - - Tile_Array[ty][tx] = Aview(row, col); + Tile_Array[ty][tx] = Aview(row, col); + }); }); - }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) + { RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - Atview(col, row) = Tile_Array[ty][tx]; + Atview(col, row) = Tile_Array[ty][tx]; + }); }); - }); + }); }); - }); - */ - }); + */ + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -282,56 +298,63 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// TODO... 
/// - /// EXERCISE: Define loop policies to mapp loop iterations to blocks, threads directly + /// EXERCISE: Define loop policies to mapp loop iterations to blocks, threads + /// directly /// const bool cuda_async = false; - using cuda_launch_policy = RAJA::LaunchPolicy>; + using cuda_launch_policy = + RAJA::LaunchPolicy>; RAJA::launch( - RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { - /* - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext /*ctx*/) { + /* + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, + N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, + RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment + const &col_tile) { - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int + ty) { RAJA::loop_icount(ctx, col_tile, [&] (int col, int + tx) { - Tile_Array[ty][tx] = Aview(row, col); + Tile_Array[ty][tx] = Aview(row, col); + }); }); - }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int + tx) { RAJA::loop_icount(ctx, row_tile, [&] (int row, int + ty) { - Atview(col, row) = Tile_Array[ty][tx]; + Atview(col, row) = Tile_Array[ty][tx]; + }); }); - }); + }); }); + */ }); - */ - }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - HIP matrix transpose example ...\n"; - int *d_A = memoryManager::allocate_gpu(N_r * N_c); - int *d_At = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_At = memoryManager::allocate_gpu(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -343,8 +366,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); constexpr int c_block_sz = TILE_DIM; constexpr int r_block_sz = TILE_DIM; @@ -360,44 +384,48 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool hip_async = false; using hip_launch_policy = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, 
RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - - Tile_Array[ty][tx] = d_Aview(row, col); - - }); - }); - - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - - d_Atview(col, row) = Tile_Array[ty][tx]; - + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_r), + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_c), + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + Tile_Array[ty][tx] = d_Aview(row, col); + }); + }); + + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + d_Atview(col, row) = Tile_Array[ty][tx]; + }); + }); + }); }); - }); - - }); }); - }); - - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// return 0; } @@ -410,16 +438,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -431,8 +465,10 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) << std::endl; } diff --git a/exercises/launch-matrix-transpose-local-array_solution.cpp b/exercises/launch-matrix-transpose-local-array_solution.cpp index fe2d41ecec..e388a58848 100644 --- a/exercises/launch-matrix-transpose-local-array_solution.cpp +++ b/exercises/launch-matrix-transpose-local-array_solution.cpp @@ -50,7 +50,7 @@ // Define dimensionality of matrices and tile size // const int DIM = 2; -#define TILE_DIM (16) // #define to appease msvc +#define TILE_DIM (16) // #define to appease msvc // // Function for checking results @@ -65,7 +65,7 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA shared matrix transpose example...\n"; @@ -84,8 +84,8 @@ int 
main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -101,8 +101,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } @@ -117,8 +119,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // Stack-allocated local array for data on a tile int Tile[TILE_DIM][TILE_DIM]; @@ -129,14 +133,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that input matrix data access // is stride-1. // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[ty][tx] = Aview(row, col); } } @@ -148,19 +155,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1. 
// - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) + { + for (int ty = 0; ty < TILE_DIM; ++ty) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[ty][tx]; } } } - } } // _mattranspose_localarray_cstyle_end @@ -179,35 +188,39 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using launch_policy_1 = RAJA::LaunchPolicy; RAJA::launch( - RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - - Tile_Array[ty][tx] = Aview(row, col); - + RAJA::LaunchParams(), // LaunchParams may be empty when only running on + // the cpu + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_r), + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_c), + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + Tile_Array[ty][tx] = Aview(row, col); + }); + }); + + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + Atview(col, row) = Tile_Array[ty][tx]; + }); + }); + }); }); - }); - - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - - Atview(col, row) = Tile_Array[ty][tx]; - - }); - }); - - }); }); - - }); // _mattranspose_localarray_raja_end checkResult(Atview, N_c, N_r); @@ -229,36 +242,40 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using launch_policy_2 = RAJA::LaunchPolicy; RAJA::launch( - RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - - Tile_Array[ty][tx] = Aview(row, col); - + RAJA::LaunchParams(), // LaunchParams may be empty when only running on + // the cpu + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_r), + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_c), + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount( + ctx, row_tile, [&](int row, 
int ty) { + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + Tile_Array[ty][tx] = Aview(row, col); + }); + }); + + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + Atview(col, row) = Tile_Array[ty][tx]; + }); + }); + }); }); - }); - - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - - Atview(col, row) = Tile_Array[ty][tx]; - - }); - }); - - }); }); - }); - checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif @@ -281,52 +298,56 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using cuda_threads_x = RAJA::LoopPolicy; const bool cuda_async = false; - using cuda_launch_policy = RAJA::LaunchPolicy>; + using cuda_launch_policy = + RAJA::LaunchPolicy>; RAJA::launch( - RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - - Tile_Array[ty][tx] = Aview(row, col); - + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_r), + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_c), + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + Tile_Array[ty][tx] = Aview(row, col); + }); + }); + + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + Atview(col, row) = Tile_Array[ty][tx]; + }); + }); + }); }); - }); - - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - - Atview(col, row) = Tile_Array[ty][tx]; - - }); - }); - - }); - }); - - }); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - HIP matrix transpose example ...\n"; - int *d_A = memoryManager::allocate_gpu(N_r * N_c); - int *d_At = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_At = memoryManager::allocate_gpu(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -338,8 +359,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), 
hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); constexpr int c_block_sz = TILE_DIM; constexpr int r_block_sz = TILE_DIM; @@ -355,44 +377,48 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool hip_async = false; using hip_launch_policy = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - - Tile_Array[ty][tx] = d_Aview(row, col); - + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_r), + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_c), + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + Tile_Array[ty][tx] = d_Aview(row, col); + }); + }); + + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + d_Atview(col, row) = Tile_Array[ty][tx]; + }); + }); + }); }); - }); - - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - - d_Atview(col, row) = Tile_Array[ty][tx]; - - }); - }); - - }); - }); - - }); + }); - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// return 0; } @@ -405,16 +431,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -426,8 +458,10 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) << std::endl; } diff --git a/exercises/launch-matrix-transpose-tiled.cpp b/exercises/launch-matrix-transpose-tiled.cpp index 1206cbc680..82e995eee3 100644 --- a/exercises/launch-matrix-transpose-tiled.cpp +++ b/exercises/launch-matrix-transpose-tiled.cpp @@ -56,7 +56,7 @@ template void 
printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA tiled matrix transpose example...\n"; @@ -77,8 +77,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of tiled matrix transpose, we @@ -94,12 +94,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// @@ -111,24 +113,28 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // // (1) Loops to iterate over tile entries // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Aview(row, col); } } } - } } // _cstyle_tiled_mattranspose_end @@ -147,13 +153,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // global iteration number. // -/// -/// TODO: Uncomment these range segments so you can use them in the -/// non-HIP exercises in this file. -/* - RAJA::TypedRangeSegment row_Range(0, N_r); - RAJA::TypedRangeSegment col_Range(0, N_c); -*/ + /// + /// TODO: Uncomment these range segments so you can use them in the + /// non-HIP exercises in this file. + /* + RAJA::TypedRangeSegment row_Range(0, N_r); + RAJA::TypedRangeSegment col_Range(0, N_c); + */ //----------------------------------------------------------------------------// std::cout << "\n Running sequential tiled matrix transpose ...\n"; @@ -165,38 +171,40 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // tile_fixed corresponds to the dimension size of the tile. 
// // _raja_tiled_mattranspose_start - //using loop_pol_1 = RAJA::LoopPolicy; + // using loop_pol_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; RAJA::launch( - RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { - - /* - RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::LaunchParams(), // LaunchParams may be empty when running on the cpu + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext /*ctx*/) { + /* + RAJA::tile(ctx, TILE_DIM, row_Range, [&] + (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, col_Range, [&] + (RAJA::TypedRangeSegment const &col_tile) { + + RAJA::loop(ctx, row_tile, [&] (int row) { + + /// + /// TODO... + /// + /// EXERCISE: Implement a loop method that takes a col_tile and + /// returns the global index to the column iteration + /// + /// Uncomment the statement below to run the kernel and check + the + /// result. + /// + + //Atview(col, row) = Aview(row, col); - RAJA::loop(ctx, row_tile, [&] (int row) { - - /// - /// TODO... - /// - /// EXERCISE: Implement a loop method that takes a col_tile and - /// returns the global index to the column iteration - /// - /// Uncomment the statement below to run the kernel and check the - /// result. - /// - - //Atview(col, row) = Aview(row, col); + }); }); - }); + */ }); - */ - }); // _raja_tiled_mattranspose_end checkResult(Atview, N_c, N_r); @@ -204,7 +212,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp tiled matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp tiled matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -212,26 +221,28 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This policy loops over tiles sequentially while exposing parallelism on // one of the inner loops. // - //using omp_for_pol_2 = RAJA::LoopPolicy; - //using loop_pol_2 = RAJA::LoopPolicy; + // using omp_for_pol_2 = RAJA::LoopPolicy; + // using loop_pol_2 = RAJA::LoopPolicy; /// /// TODO... /// /// EXERCISE: Create a launch_policy_2 that will create an omp parallel region /// - /// Uncomment the kernel below to run it and check the result. - /// - /// + /// Uncomment the kernel below to run it and check the result. 
+ /// + /// /* RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, row_Range, [&] + (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, col_Range, [&] + (RAJA::TypedRangeSegment const &col_tile) { RAJA::loop(ctx, row_tile, [&] (int row) { RAJA::loop(ctx, col_tile, [&] (int col) { @@ -252,7 +263,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda tiled matrix transpose ...\n"; @@ -277,39 +288,41 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// EXERCISE: Implement the cuda launch policy to dispatch the kernel below /// on the GPU /// - /// When you uncomment kernel code below, you will also need to + /// When you uncomment kernel code below, you will also need to /// uncomment variables above that are used within it. /// -/* - RAJA::launch( - RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + /* + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, row_Range, [&] + (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, col_Range, [&] + (RAJA::TypedRangeSegment const &col_tile) { - RAJA::loop(ctx, row_tile, [&] (int row) { - RAJA::loop(ctx, col_tile, [&] (int col) { + RAJA::loop(ctx, row_tile, [&] (int row) { + RAJA::loop(ctx, col_tile, [&] (int col) { - Atview(col, row) = Aview(row, col); + Atview(col, row) = Aview(row, col); + }); }); - }); + }); }); - }); - }); -*/ + }); + */ checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running hip tiled matrix transpose ...\n"; @@ -317,15 +330,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::TypedRangeSegment row_Range2(0, N_r); RAJA::TypedRangeSegment col_Range2(0, N_c); - int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); int* d_At = memoryManager::allocate_gpu(N_r * N_c); RAJA::View> d_Aview(d_A, N_r, N_c); RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); constexpr int c_block_sz = TILE_DIM; constexpr int r_block_sz = TILE_DIM; @@ -341,31 
+355,33 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool hip_async = false; using hip_launch_policy = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, row_Range2, [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, col_Range2, [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA::loop(ctx, row_tile, [&] (int row) { - RAJA::loop(ctx, col_tile, [&] (int col) { - - Atview(col, row) = Aview(row, col); - + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + row_Range2, + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + col_Range2, + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA::loop(ctx, row_tile, [&](int row) { + RAJA::loop(ctx, col_tile, [&](int col) { + Atview(col, row) = Aview(row, col); + }); + }); + }); }); - }); - - }); }); - }); - - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif //----------------------------------------------------------------------------// @@ -389,16 +405,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -410,11 +432,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA tiled matrix transpose example...\n"; @@ -77,8 +77,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of tiled matrix transpose, we @@ -94,12 +94,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// @@ -111,30 +113,34 @@ int main(int RAJA_UNUSED_ARG(argc), 
char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // // (1) Loops to iterate over tile entries // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Aview(row, col); } } } - } } // _cstyle_tiled_mattranspose_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// // @@ -162,25 +168,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using loop_pol_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; - RAJA::launch(RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA::loop(ctx, row_tile, [&] (int row) { - RAJA::loop(ctx, col_tile, [&] (int col) { - - Atview(col, row) = Aview(row, col); - + RAJA::launch( + RAJA::LaunchParams(), // LaunchParams may be empty when running on the cpu + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + row_Range, + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + col_Range, + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA::loop(ctx, row_tile, [&](int row) { + RAJA::loop(ctx, col_tile, [&](int col) { + Atview(col, row) = Aview(row, col); + }); + }); + }); }); - }); - - }); }); - - }); // _raja_tiled_mattranspose_end checkResult(Atview, N_c, N_r); @@ -188,7 +196,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp tiled matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp tiled matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -201,32 +210,33 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using launch_policy_2 = RAJA::LaunchPolicy; RAJA::launch( - RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA::loop(ctx, row_tile, [&] (int row) { - RAJA::loop(ctx, col_tile, [&] (int col) { - - Atview(col, row) = Aview(row, col); - + RAJA::LaunchParams(), // LaunchParams may be empty when running on the cpu + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + row_Range, + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + col_Range, + [&](RAJA::TypedRangeSegment const& 
col_tile) { + RAJA::loop(ctx, row_tile, [&](int row) { + RAJA::loop(ctx, col_tile, [&](int col) { + Atview(col, row) = Aview(row, col); + }); + }); + }); }); - }); - - }); }); - }); - checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda tiled matrix transpose ...\n"; @@ -237,7 +247,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int r_block_sz = TILE_DIM; const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); - + // _raja_mattranspose_cuda_start using cuda_teams_y = RAJA::LoopPolicy; using cuda_teams_x = RAJA::LoopPolicy; @@ -246,49 +256,52 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using cuda_threads_x = RAJA::LoopPolicy; const bool cuda_async = false; - using cuda_launch_policy = RAJA::LaunchPolicy>; + using cuda_launch_policy = + RAJA::LaunchPolicy>; RAJA::launch( - RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA::loop(ctx, row_tile, [&] (int row) { - RAJA::loop(ctx, col_tile, [&] (int col) { - - Atview(col, row) = Aview(row, col); - + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + row_Range, + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + col_Range, + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA::loop(ctx, row_tile, [&](int row) { + RAJA::loop(ctx, col_tile, [&](int col) { + Atview(col, row) = Aview(row, col); + }); + }); + }); }); - }); - - }); }); - - }); // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running hip tiled matrix transpose ...\n"; - int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); int* d_At = memoryManager::allocate_gpu(N_r * N_c); RAJA::View> d_Aview(d_A, N_r, N_c); RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); constexpr int c_block_sz = TILE_DIM; constexpr int r_block_sz = TILE_DIM; @@ -305,30 +318,32 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using hip_launch_policy = RAJA::LaunchPolicy>; RAJA::launch( - RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, row_Range, 
[&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile (ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA::loop(ctx, row_tile, [&] (int row) { - RAJA::loop(ctx, col_tile, [&] (int col) { - - d_Atview(col, row) = d_Aview(row, col); - - }); - }); - - }); - }); - - }); + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + row_Range, + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + col_Range, + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA::loop(ctx, row_tile, [&](int row) { + RAJA::loop(ctx, col_tile, [&](int col) { + d_Atview(col, row) = d_Aview(row, col); + }); + }); + }); + }); + }); - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif //----------------------------------------------------------------------------// @@ -352,16 +367,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -373,11 +394,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix transpose exercise...\n"; @@ -60,8 +60,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -77,12 +77,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of matrix transpose...\n"; @@ -90,9 +92,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); // _cstyle_mattranspose_start - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - Atview(col, row) = Aview(row, col); + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + 
Atview(col, row) = Aview(row, col); } } // _cstyle_mattranspose_end @@ -104,10 +108,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following RAJA variants use the RAJA::kernel method to carryout the - // transpose. + // transpose. // // Here, we define RAJA range segments to establish the iteration spaces. - // Iterations inside a RAJA loop is given by their global iteration number. + // Iterations inside a RAJA loop is given by their global iteration number. // RAJA::TypedRangeSegment row_Range(0, N_r); RAJA::TypedRangeSegment col_Range(0, N_c); @@ -118,98 +122,90 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following policy carries out the transpose - // using sequential loops. + // using sequential loops. // // _raja_mattranspose_start using loop_policy_seq = RAJA::LoopPolicy; using launch_policy_seq = RAJA::LaunchPolicy; - RAJA::launch - (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, row_Range, [&] (int /*row*/) { - RAJA::loop(ctx, col_Range, [&] (int /*col*/) { - - /// TODO... - /// - /// EXERCISE: Implement the kernel body for the transpose operation - /// - + RAJA::launch( + RAJA::LaunchParams(), // LaunchParams may be empty when running on the + // host + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, row_Range, [&](int /*row*/) { + RAJA::loop(ctx, col_Range, [&](int /*col*/) { + /// TODO... + /// + /// EXERCISE: Implement the kernel body for the transpose operation + /// + }); }); }); - - }); // _raja_mattranspose_end checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); // // This policy loops sequentially while exposing parallelism on // one of the inner loops. - - //uncomment to use in example below - //using loop_policy_omp = RAJA::LoopPolicy; - using launch_policy_omp = RAJA::LaunchPolicy; - - RAJA::launch(RAJA::LaunchParams(), //LaunchParams may be empty when running on the host - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { - - - /// TODO... - /// - /// EXERCISE: Implement the loops to apply omp parallism and sequential - /// execution on the column and row loops respectively - /// - - //Atview(col, row) = Aview(row, col); + // uncomment to use in example below + // using loop_policy_omp = RAJA::LoopPolicy; + using launch_policy_omp = RAJA::LaunchPolicy; - }); + RAJA::launch( + RAJA::LaunchParams(), // LaunchParams may be empty when running on the + // host + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext /*ctx*/) { + /// TODO... 
+ /// + /// EXERCISE: Implement the loops to apply omp parallism and sequential + /// execution on the column and row loops respectively + /// + + // Atview(col, row) = Aview(row, col); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - + // _raja_mattranspose_cuda_start using cuda_thread_x = RAJA::LoopPolicy; using cuda_thread_y = RAJA::LoopPolicy; - const bool async = false; //execute asynchronously + const bool async = false; // execute asynchronously using launch_policy_cuda = RAJA::LaunchPolicy>; RAJA::launch( - RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(16,16)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, row_Range, [&] (int row) { - RAJA::loop(ctx, col_Range, [&] (int col) { - + RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(16, 16)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, row_Range, [&](int row) { + RAJA::loop(ctx, col_Range, [&](int col) { Atview(col, row) = Aview(row, col); - + }); }); }); - - }); // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // // Clean up. @@ -220,7 +216,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n DONE!...\n"; return 0; -} +} // // Function to check result and report P/F. 
@@ -229,16 +225,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -250,11 +252,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix transpose exercise...\n"; @@ -60,8 +60,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -77,12 +77,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of matrix transpose...\n"; @@ -90,9 +92,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); // _cstyle_mattranspose_start - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - Atview(col, row) = Aview(row, col); + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + Atview(col, row) = Aview(row, col); } } // _cstyle_mattranspose_end @@ -104,10 +108,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following RAJA variants use the RAJA::kernel method to carryout the - // transpose. + // transpose. // // Here, we define RAJA range segments to establish the iteration spaces. - // Iterations inside a RAJA loop is given by their global iteration number. + // Iterations inside a RAJA loop is given by their global iteration number. // RAJA::TypedRangeSegment row_Range(0, N_r); RAJA::TypedRangeSegment col_Range(0, N_c); @@ -118,32 +122,30 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following policy carries out the transpose - // using sequential loops. + // using sequential loops. 
// // _raja_mattranspose_start using loop_policy_seq = RAJA::LoopPolicy; using launch_policy_seq = RAJA::LaunchPolicy; RAJA::launch( - RAJA::LaunchParams(), //LaunchParams may be empty when running on the host - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, row_Range, [&] (int row) { - RAJA::loop(ctx, col_Range, [&] (int col) { - + RAJA::LaunchParams(), // LaunchParams may be empty when running on the + // host + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, row_Range, [&](int row) { + RAJA::loop(ctx, col_Range, [&](int col) { Atview(col, row) = Aview(row, col); - + }); }); }); - - }); // _raja_mattranspose_end checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -155,56 +157,49 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using launch_policy_omp = RAJA::LaunchPolicy; RAJA::launch( - RAJA::LaunchParams(), //LaunchParams may be empty when running on the host - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, row_Range, [&] (int row) { - RAJA::loop(ctx, col_Range, [&] (int col) { - + RAJA::LaunchParams(), // LaunchParams may be empty when running on the + // host + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, row_Range, [&](int row) { + RAJA::loop(ctx, col_Range, [&](int col) { Atview(col, row) = Aview(row, col); - + }); }); }); - }); - checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - + // _raja_mattranspose_cuda_start using cuda_thread_x = RAJA::LoopPolicy; using cuda_thread_y = RAJA::LoopPolicy; - const bool async = false; //execute asynchronously + const bool async = false; // execute asynchronously using launch_policy_cuda = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(16,16)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, row_Range, [&] (int row) { - RAJA::loop(ctx, col_Range, [&] (int col) { - + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(16, 16)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, row_Range, [&](int row) { + RAJA::loop(ctx, col_Range, [&](int col) { Atview(col, row) = Aview(row, col); - + }); }); }); - - }); // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // // Clean up. @@ -215,7 +210,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n DONE!...\n"; return 0; -} +} // // Function to check result and report P/F. 
@@ -224,16 +219,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -245,11 +246,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< -__launch_bounds__(i_block_size*j_block_size*k_block_size) -__global__ void nested_init(double* a, double c, int N) +template +__launch_bounds__(i_block_size* j_block_size* k_block_size) __global__ + void nested_init(double* a, double c, int N) { int i = blockIdx.x * i_block_size + threadIdx.x; int j = blockIdx.y * j_block_size + threadIdx.y; int k = blockIdx.z; - if ( i < N && j < N && k < N ) { - a[i+N*(j+N*k)] = c * i * j * k ; + if (i < N && j < N && k < N) + { + a[i + N * (j + N * k)] = c * i * j * k; } } // _cuda_tensorinit_kernel_end @@ -58,64 +59,71 @@ __global__ void nested_init(double* a, double c, int N) void checkResult(double* a, double* aref, const int n); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; -// _init_define_start -// -// 3D tensor has N^3 entries -// + // _init_define_start + // + // 3D tensor has N^3 entries + // constexpr int N = 100; constexpr int N_tot = N * N * N; constexpr double c = 0.0001; double* a = memoryManager::allocate(N_tot); double* a_ref = memoryManager::allocate(N_tot); -// _init_define_end - -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; - -// _cstyle_tensorinit_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - a_ref[i+N*(j+N*k)] = c * i * j * k ; + // _init_define_end + + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference " + "solution ...\n"; + + // _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + a_ref[i + N * (j + N * k)] = c * i * j * k; } } } -// _cstyle_tensorinit_seq_end + // _cstyle_tensorinit_seq_end -//----------------------------------------------------------------------------// -// We introduce a RAJA View to wrap the tensor data pointer and simplify -// multi-dimensional indexing. -// We use this in the rest of the examples in this file. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // We introduce a RAJA View to wrap the tensor data pointer and simplify + // multi-dimensional indexing. + // We use this in the rest of the examples in this file. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential tensor init...\n"; -// _3D_raja_view_start - RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); -// _3D_raja_view_end - -// _cstyle_tensorinit_view_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; + // _3D_raja_view_start + RAJA::View> aView(a, N, N, N); + // _3D_raja_view_end + + // _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_view_seq_end + // _cstyle_tensorinit_view_seq_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential tensor init...\n"; @@ -129,50 +137,55 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// the tensor initialization kernel. /// -// _raja_tensorinit_seq_start - //using loop_policy_1 = RAJA::LoopPolicy; + // _raja_tensorinit_seq_start + // using loop_policy_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; - RAJA::launch - (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { - /* - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + RAJA::launch( + RAJA::LaunchParams(), // LaunchParams may be empty when running on the + // host + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext /*ctx*/) { + /* + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] + (int k) { - //Add additional loop methods to complete the kernel + //Add additional loop methods to complete the kernel + }); + */ }); - */ - }); -// _raja_tensorinit_seq_end + // _raja_tensorinit_seq_end checkResult(a, a_ref, N_tot); #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// C-style and RAJA OpenMP multithreading variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA OpenMP multithreading variants. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - // _cstyle_tensorinit_omp_outer_start - #pragma omp parallel for - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; +// _cstyle_tensorinit_omp_outer_start +#pragma omp parallel for + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_omp_outer_end + // _cstyle_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP tensor init...\n"; @@ -186,61 +199,61 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// kernel that creates a parallel outer loop. /// -// _raja_tensorinit_omp_outer_start + // _raja_tensorinit_omp_outer_start /* using omp_policy_2 = RAJA::LoopPolicy; using loop_policy_2 = RAJA::LoopPolicy; */ using launch_policy_2 = RAJA::LaunchPolicy; - RAJA::launch - (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { - - //TODO: Use the omp_policy_2 to distribute loop iterations - //in a RAJA::loop method - /* - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { - + RAJA::launch( + RAJA::LaunchParams(), // LaunchParams may be empty when running on the + // host + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext /*ctx*/) { + // TODO: Use the omp_policy_2 to distribute loop iterations + // in a RAJA::loop method + /* + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] + (int j) { RAJA::loop(ctx, RAJA::TypedRangeSegment(0, + N), [&] (int i) { - }); - }); - */ - }); -// _raja_tensorinit_omp_outer_end + }); + }); + */ + }); + // _raja_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) // // Define total thread-block size and size of each block dimension // -// _cuda_blockdim_start + // _cuda_blockdim_start constexpr int block_size = 256; constexpr int i_block_sz = 32; constexpr int j_block_sz = block_size / i_block_sz; constexpr int k_block_sz = 1; - const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); - const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); - const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); -// _cuda_blockdim_end + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N, i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N, j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N, k_block_sz); + // _cuda_blockdim_end -//----------------------------------------------------------------------------// -// C-style and RAJA CUDA GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA CUDA GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_cuda_start + // _raja_tensorinit_cuda_start using cuda_teams_z_3 = RAJA::LoopPolicy; using cuda_global_thread_y_3 = RAJA::LoopPolicy; using cuda_global_thread_x_3 = RAJA::LoopPolicy; @@ -248,34 +261,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_3 = false; using launch_policy_3 = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_i ,n_blocks_j, n_blocks_k), - RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { - - aView(i, j, k) = c * i * j * k ; - - }); - }); + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int j) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int i) { + aView(i, j, k) = c * i * j * k; + }); + }); + }); }); - }); -// _raja_tensorinit_cuda_end + // _raja_tensorinit_cuda_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_cuda_tiled_direct_start + // _raja_tensorinit_cuda_tiled_direct_start using cuda_teams_z_4 = RAJA::LoopPolicy; using cuda_teams_y_4 = RAJA::LoopPolicy; using cuda_teams_x_4 = RAJA::LoopPolicy; @@ -286,46 +299,46 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_4 = false; using launch_policy_4 = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), - RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - - RAJA::tile - (ctx, j_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &j_tile) { - - RAJA::tile - (ctx, i_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &i_tile) { - - RAJA::loop(ctx, j_tile, [&] (int j) { - RAJA::loop(ctx, i_tile, [&] (int i) { - - aView(i, j, k) = c * i * j * k ; - + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::tile( + ctx, + j_block_sz, + RAJA::TypedRangeSegment(0, N), + [&](RAJA::TypedRangeSegment const& j_tile) { + RAJA::tile( + ctx, + i_block_sz, + RAJA::TypedRangeSegment(0, N), + [&](RAJA::TypedRangeSegment const& i_tile) { + RAJA::loop(ctx, j_tile, [&](int j) { + RAJA::loop( + ctx, i_tile, [&](int i) { + aView(i, j, k) = c * i * j * k; + }); + }); + }); }); - }); - }); - }); - }); - }); -// _raja_tensorinit_cuda_tiled_direct_end + // _raja_tensorinit_cuda_tiled_direct_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + 
//----------------------------------------------------------------------------// std::cout << "\n Running CUDA tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _cuda_tensorinit_tiled_direct_start + // _cuda_tensorinit_tiled_direct_start dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); - static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + static_assert(i_block_sz * j_block_sz * k_block_sz == block_size, "Invalid block_size"); dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), @@ -333,10 +346,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); nested_init - <<>>(a, c, N); - cudaErrchk( cudaGetLastError() ); + <<>>(a, c, N); + cudaErrchk(cudaGetLastError()); cudaErrchk(cudaDeviceSynchronize()); -// _cuda_tensorinit_tiled_direct_end + // _cuda_tensorinit_tiled_direct_end checkResult(a, a_ref, N_tot); @@ -353,27 +366,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int j_block_sz = block_size / i_block_sz; constexpr int k_block_sz = 1; - const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); - const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); - const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N, i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N, j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N, k_block_sz); -//----------------------------------------------------------------------------// -// RAJA HIP GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA HIP GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - double *d_a = memoryManager::allocate_gpu(N_tot); + double* d_a = memoryManager::allocate_gpu(N_tot); -// _3D_raja_device_view_start - RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); -// _3D_raja_deviceview_end + // _3D_raja_device_view_start + RAJA::View> d_aView(d_a, N, N, N); + // _3D_raja_deviceview_end - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_start + // _raja_tensorinit_hip_start using hip_teams_z_5 = RAJA::LoopPolicy; using hip_global_thread_y_5 = RAJA::LoopPolicy; using hip_global_thread_x_5 = RAJA::LoopPolicy; @@ -381,36 +394,35 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_5 = false; using launch_policy_5 = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), - RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { - - d_aView(i, j, k) = c * i * j * k ; - - }); - }); - }); - - }); -// _raja_tensorinit_hip_end + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int j) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int i) { + d_aView(i, j, k) = c * i * j * k; + }); + }); + }); + }); + // _raja_tensorinit_hip_end - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_tiled_direct_start + // _raja_tensorinit_hip_tiled_direct_start using hip_teams_z_6 = RAJA::LoopPolicy; using hip_teams_y_6 = RAJA::LoopPolicy; using hip_teams_x_6 = RAJA::LoopPolicy; @@ -421,42 +433,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_6 = false; using launch_policy_6 = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), - RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - - RAJA::tile - (ctx, j_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &j_tile) { - - RAJA::tile - (ctx, i_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &i_tile) { - - RAJA::loop(ctx, j_tile, [&] (int j) { - RAJA::loop(ctx, i_tile, [&] (int i) { - - d_aView(i, j, k) = c * i * j * k ; - + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::tile( + ctx, + j_block_sz, + RAJA::TypedRangeSegment(0, N), + [&](RAJA::TypedRangeSegment const& j_tile) { + RAJA::tile( + ctx, + i_block_sz, + RAJA::TypedRangeSegment(0, N), + [&](RAJA::TypedRangeSegment const& i_tile) { + RAJA::loop(ctx, j_tile, [&](int j) { + RAJA::loop( + ctx, i_tile, [&](int i) { + d_aView(i, j, k) = c * i * j * k; + }); + }); + }); }); - }); - }); - }); - }); - }); -// _raja_tensorinit_hip_tiled_direct_end + // _raja_tensorinit_hip_tiled_direct_end - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); memoryManager::deallocate_gpu(d_a); #endif // if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // Clean up... 
memoryManager::deallocate(a); @@ -475,14 +487,18 @@ void checkResult(double* a, double* aref, const int n) bool correct = true; int i = 0; - while ( correct && (i < n) ) { + while (correct && (i < n)) + { correct = std::abs(a[i] - aref[i]) < 10e-12; i++; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/launchintro-execpols_solution.cpp b/exercises/launchintro-execpols_solution.cpp index 1bfff68acf..37da99f9f0 100644 --- a/exercises/launchintro-execpols_solution.cpp +++ b/exercises/launchintro-execpols_solution.cpp @@ -37,16 +37,17 @@ #if defined(RAJA_ENABLE_CUDA) // _cuda_tensorinit_kernel_start -template< int i_block_size, int j_block_size, int k_block_size > -__launch_bounds__(i_block_size*j_block_size*k_block_size) -__global__ void nested_init(double* a, double c, int N) +template +__launch_bounds__(i_block_size* j_block_size* k_block_size) __global__ + void nested_init(double* a, double c, int N) { int i = blockIdx.x * i_block_size + threadIdx.x; int j = blockIdx.y * j_block_size + threadIdx.y; int k = blockIdx.z; - if ( i < N && j < N && k < N ) { - a[i+N*(j+N*k)] = c * i * j * k ; + if (i < N && j < N && k < N) + { + a[i + N * (j + N * k)] = c * i * j * k; } } // _cuda_tensorinit_kernel_end @@ -58,174 +59,186 @@ __global__ void nested_init(double* a, double c, int N) void checkResult(double* a, double* aref, const int n); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; -// _init_define_start -// -// 3D tensor has N^3 entries -// + // _init_define_start + // + // 3D tensor has N^3 entries + // constexpr int N = 100; constexpr int N_tot = N * N * N; constexpr double c = 0.0001; double* a = memoryManager::allocate(N_tot); double* a_ref = memoryManager::allocate(N_tot); -// _init_define_end - -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; - -// _cstyle_tensorinit_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - a_ref[i+N*(j+N*k)] = c * i * j * k ; + // _init_define_end + + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference " + "solution ...\n"; + + // _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + a_ref[i + N * (j + N * k)] = c * i * j * k; } } } -// _cstyle_tensorinit_seq_end + // _cstyle_tensorinit_seq_end -//----------------------------------------------------------------------------// -// We introduce a RAJA View to wrap the tensor data pointer and simplify -// multi-dimensional indexing. -// We use this in the rest of the examples in this file. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // We introduce a RAJA View to wrap the tensor data pointer and simplify + // multi-dimensional indexing. + // We use this in the rest of the examples in this file. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential tensor init...\n"; -// _3D_raja_view_start - RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); -// _3D_raja_view_end - -// _cstyle_tensorinit_view_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; + // _3D_raja_view_start + RAJA::View> aView(a, N, N, N); + // _3D_raja_view_end + + // _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_view_seq_end + // _cstyle_tensorinit_view_seq_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_seq_start + // _raja_tensorinit_seq_start using loop_policy_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; - RAJA::launch - (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { - - aView(i, j, k) = c * i * j * k ; - + RAJA::launch( + RAJA::LaunchParams(), // LaunchParams may be empty when running on the + // host + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int j) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int i) { + aView(i, j, k) = c * i * j * k; + }); + }); }); - }); }); - }); -// _raja_tensorinit_seq_end + // _raja_tensorinit_seq_end checkResult(a, a_ref, N_tot); #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// C-style and RAJA OpenMP multithreading variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA OpenMP multithreading variants. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - // _cstyle_tensorinit_omp_outer_start - #pragma omp parallel for - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; +// _cstyle_tensorinit_omp_outer_start +#pragma omp parallel for + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_omp_outer_end + // _cstyle_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_omp_outer_start + // _raja_tensorinit_omp_outer_start using omp_policy_2 = RAJA::LoopPolicy; using loop_policy_2 = RAJA::LoopPolicy; using launch_policy_2 = RAJA::LaunchPolicy; - RAJA::launch - (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { - - aView(i, j, k) = c * i * j * k ; - + RAJA::launch( + RAJA::LaunchParams(), // LaunchParams may be empty when running on the + // host + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int j) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int i) { + aView(i, j, k) = c * i * j * k; + }); + }); }); - }); }); - }); -// _raja_tensorinit_omp_outer_end + // _raja_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) // // Define total thread-block size and size of each block dimension // -// _cuda_blockdim_start + // _cuda_blockdim_start constexpr int block_size = 256; constexpr int i_block_sz = 32; constexpr int j_block_sz = block_size / i_block_sz; constexpr int k_block_sz = 1; - const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); - const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); - const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); -// _cuda_blockdim_end + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N, i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N, j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N, k_block_sz); + // _cuda_blockdim_end -//----------------------------------------------------------------------------// -// C-style and RAJA CUDA GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA CUDA GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_cuda_start + // _raja_tensorinit_cuda_start using cuda_teams_z_3 = RAJA::LoopPolicy; using cuda_global_thread_y_3 = RAJA::LoopPolicy; using cuda_global_thread_x_3 = RAJA::LoopPolicy; @@ -233,34 +246,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_3 = false; using launch_policy_3 = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_i ,n_blocks_j, n_blocks_k), - RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { - - aView(i, j, k) = c * i * j * k ; - - }); - }); + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int j) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int i) { + aView(i, j, k) = c * i * j * k; + }); + }); + }); }); - }); -// _raja_tensorinit_cuda_end + // _raja_tensorinit_cuda_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_cuda_tiled_direct_start + // _raja_tensorinit_cuda_tiled_direct_start using cuda_teams_z_4 = RAJA::LoopPolicy; using cuda_teams_y_4 = RAJA::LoopPolicy; using cuda_teams_x_4 = RAJA::LoopPolicy; @@ -271,46 +284,46 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_4 = false; using launch_policy_4 = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), - RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - - RAJA::tile - (ctx, j_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &j_tile) { - - RAJA::tile - (ctx, i_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &i_tile) { - - RAJA::loop(ctx, j_tile, [&] (int j) { - RAJA::loop(ctx, i_tile, [&] (int i) { - - aView(i, j, k) = c * i * j * k ; - + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::tile( + ctx, + j_block_sz, + RAJA::TypedRangeSegment(0, N), + [&](RAJA::TypedRangeSegment const& j_tile) { + RAJA::tile( + ctx, + i_block_sz, + RAJA::TypedRangeSegment(0, N), + [&](RAJA::TypedRangeSegment const& i_tile) { + RAJA::loop(ctx, j_tile, [&](int j) { + RAJA::loop( + ctx, i_tile, [&](int i) { + aView(i, j, k) = c * i * j * k; + }); + }); + }); }); - }); - }); - }); - }); - }); -// _raja_tensorinit_cuda_tiled_direct_end + // _raja_tensorinit_cuda_tiled_direct_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + 
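// ---------------------------------------------------------------------------
// Illustrative aside: with the block-size constants defined above
// (block_size = 256, i_block_sz = 32, j_block_sz = block_size / i_block_sz = 8,
// k_block_sz = 1) and N = 100, the launch shape works out to
//
//   n_blocks_i = RAJA_DIVIDE_CEILING_INT(100, 32) = 4
//   n_blocks_j = RAJA_DIVIDE_CEILING_INT(100,  8) = 13
//   n_blocks_k = RAJA_DIVIDE_CEILING_INT(100,  1) = 100
//
// and 32 * 8 * 1 == 256 == block_size, which is exactly the condition the
// static_assert in the raw CUDA variant below enforces.
// ---------------------------------------------------------------------------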
//----------------------------------------------------------------------------// std::cout << "\n Running CUDA tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _cuda_tensorinit_tiled_direct_start + // _cuda_tensorinit_tiled_direct_start dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); - static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + static_assert(i_block_sz * j_block_sz * k_block_sz == block_size, "Invalid block_size"); dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), @@ -318,10 +331,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); nested_init - <<>>(a, c, N); - cudaErrchk( cudaGetLastError() ); + <<>>(a, c, N); + cudaErrchk(cudaGetLastError()); cudaErrchk(cudaDeviceSynchronize()); -// _cuda_tensorinit_tiled_direct_end + // _cuda_tensorinit_tiled_direct_end checkResult(a, a_ref, N_tot); @@ -338,27 +351,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int j_block_sz = block_size / i_block_sz; constexpr int k_block_sz = 1; - const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); - const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); - const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N, i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N, j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N, k_block_sz); -//----------------------------------------------------------------------------// -// RAJA HIP GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA HIP GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - double *d_a = memoryManager::allocate_gpu(N_tot); + double* d_a = memoryManager::allocate_gpu(N_tot); -// _3D_raja_device_view_start - RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); -// _3D_raja_deviceview_end + // _3D_raja_device_view_start + RAJA::View> d_aView(d_a, N, N, N); + // _3D_raja_deviceview_end - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_start + // _raja_tensorinit_hip_start using hip_teams_z_5 = RAJA::LoopPolicy; using hip_global_thread_y_5 = RAJA::LoopPolicy; using hip_global_thread_x_5 = RAJA::LoopPolicy; @@ -366,36 +379,35 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_5 = false; using launch_policy_5 = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), - RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { - - d_aView(i, j, k) = c * i * j * k ; - - }); - }); - }); - - }); -// _raja_tensorinit_hip_end + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int j) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int i) { + d_aView(i, j, k) = c * i * j * k; + }); + }); + }); + }); + // _raja_tensorinit_hip_end - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_tiled_direct_start + // _raja_tensorinit_hip_tiled_direct_start using hip_teams_z_6 = RAJA::LoopPolicy; using hip_teams_y_6 = RAJA::LoopPolicy; using hip_teams_x_6 = RAJA::LoopPolicy; @@ -406,42 +418,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_6 = false; using launch_policy_6 = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), - RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - - RAJA::tile - (ctx, j_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &j_tile) { - - RAJA::tile - (ctx, i_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &i_tile) { - - RAJA::loop(ctx, j_tile, [&] (int j) { - RAJA::loop(ctx, i_tile, [&] (int i) { - - d_aView(i, j, k) = c * i * j * k ; - + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::tile( + ctx, + j_block_sz, + RAJA::TypedRangeSegment(0, N), + [&](RAJA::TypedRangeSegment const& j_tile) { + RAJA::tile( + ctx, + i_block_sz, + RAJA::TypedRangeSegment(0, N), + [&](RAJA::TypedRangeSegment const& i_tile) { + RAJA::loop(ctx, j_tile, [&](int j) { + RAJA::loop( + ctx, i_tile, [&](int i) { + d_aView(i, j, k) = c * i * j * k; + }); + }); + }); }); - }); - }); - }); - }); - }); -// _raja_tensorinit_hip_tiled_direct_end + // _raja_tensorinit_hip_tiled_direct_end - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); memoryManager::deallocate_gpu(d_a); #endif // if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // Clean up... 
memoryManager::deallocate(a); @@ -460,14 +472,18 @@ void checkResult(double* a, double* aref, const int n) bool correct = true; int i = 0; - while ( correct && (i < n) ) { + while (correct && (i < n)) + { correct = std::abs(a[i] - aref[i]) < 10e-12; i++; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/memoryManager.hpp b/exercises/memoryManager.hpp index 62d3d6e3e7..6f68615a45 100644 --- a/exercises/memoryManager.hpp +++ b/exercises/memoryManager.hpp @@ -28,20 +28,20 @@ namespace memoryManager { #if defined(RAJA_ENABLE_SYCL) - static camp::resources::Resource* sycl_res; +static camp::resources::Resource* sycl_res; #endif template -T *allocate(RAJA::Index_type size) +T* allocate(RAJA::Index_type size) { - T *ptr; + T* ptr; #if defined(RAJA_ENABLE_CUDA) cudaErrchk( - cudaMallocManaged((void **)&ptr, sizeof(T) * size, cudaMemAttachGlobal)); + cudaMallocManaged((void**)&ptr, sizeof(T) * size, cudaMemAttachGlobal)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipMalloc((void **)&ptr, sizeof(T) * size)); + hipErrchk(hipMalloc((void**)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_SYCL) - ptr = sycl_res->allocate(size, camp::resources::MemoryAccess::Managed); + ptr = sycl_res->allocate(size, camp::resources::MemoryAccess::Managed); #else ptr = new T[size]; #endif @@ -49,9 +49,10 @@ T *allocate(RAJA::Index_type size) } template -void deallocate(T *&ptr) +void deallocate(T*& ptr) { - if (ptr) { + if (ptr) + { #if defined(RAJA_ENABLE_CUDA) cudaErrchk(cudaFree(ptr)); #elif defined(RAJA_ENABLE_HIP) @@ -65,37 +66,39 @@ void deallocate(T *&ptr) } } -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) - template - T *allocate_gpu(RAJA::Index_type size) - { - T *ptr; +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || \ + defined(RAJA_ENABLE_SYCL) +template +T* allocate_gpu(RAJA::Index_type size) +{ + T* ptr; #if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaMalloc((void **)&ptr, sizeof(T) * size)); + cudaErrchk(cudaMalloc((void**)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipMalloc((void **)&ptr, sizeof(T) * size)); + hipErrchk(hipMalloc((void**)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_SYCL) - auto qu = sycl_res->get().get_queue(); - ptr = cl::sycl::malloc_device(size, *qu); + auto qu = sycl_res->get().get_queue(); + ptr = cl::sycl::malloc_device(size, *qu); #endif - return ptr; - } + return ptr; +} - template - void deallocate_gpu(T *&ptr) +template +void deallocate_gpu(T*& ptr) +{ + if (ptr) { - if (ptr) { #if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaFree(ptr)); + cudaErrchk(cudaFree(ptr)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipFree(ptr)); + hipErrchk(hipFree(ptr)); #elif defined(RAJA_ENABLE_SYCL) sycl_res->deallocate(ptr); #endif - ptr = nullptr; - } + ptr = nullptr; } +} #endif -}; // namespace memoryManager +}; // namespace memoryManager #endif diff --git a/exercises/offset-layout-stencil.cpp b/exercises/offset-layout-stencil.cpp index 3432adbb50..478fdef1cb 100644 --- a/exercises/offset-layout-stencil.cpp +++ b/exercises/offset-layout-stencil.cpp @@ -16,21 +16,21 @@ /* * Offset Layout Stencil Exercise * - * This exercise applies a five-point stencil to the interior cells of a + * This exercise applies a five-point stencil to the interior cells of a * lattice and stores the resulting sums in a second lattice of equal size. 
- * You can think of the lattice as representing the centers of cells on a - * two-dimensional Cartesian mesh. + * You can think of the lattice as representing the centers of cells on a + * two-dimensional Cartesian mesh. * - * The five-point stencil accumulates values of a cell and its four neighbors. - * Assuming the cells of a lattice may be accessed through a row/col fashion, + * The five-point stencil accumulates values of a cell and its four neighbors. + * Assuming the cells of a lattice may be accessed through a row/col fashion, * the stencil may be expressed as the following sum: - * + * * output(row, col) = input(row, col) + * input(row - 1, col) + input(row + 1, col) + * input(row, col - 1) + input(row, col + 1) * * We assume a lattice has N_r x N_c interior nodes and a padded edge of zeros - * for a lattice of size (N_r + 2) x (N_c + 2). + * for a lattice of size (N_r + 2) x (N_c + 2). * * In the case of N_r = N_c = 3, the input lattice values are: * @@ -60,8 +60,8 @@ * | 0 | 0 | 0 | 0 | 0 | * --------------------- * - * In this exercise, we use RAJA::OffsetLayout and RAJA::View objects to - * simplify the indexing to perform the stencil calculation. For the + * In this exercise, we use RAJA::OffsetLayout and RAJA::View objects to + * simplify the indexing to perform the stencil calculation. For the * purposes of discussion, we enumerate the lattice in the following manner: * * -------------------------------------------------- @@ -81,12 +81,12 @@ * * RAJA features shown: * - RAJA::kernel kernel execution method and execution policies - * - RAJA::View + * - RAJA::View * - RAJA::Layout * * For the CUDA implementation, we use unified memory to hold the lattice data. * For HIP, we use explicit host-device memory and manually copy data between - * the two. + * the two. 
*/ /* @@ -111,28 +111,28 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nFive-point stencil example...\n"; -// _stencil_define_start -// -// Define num of interior cells in row/cols in a lattice -// + // _stencil_define_start + // + // Define num of interior cells in row/cols in a lattice + // constexpr int N_r = 5; constexpr int N_c = 4; -// -// Define total num of cells in rows/cols in a lattice -// + // + // Define total num of cells in rows/cols in a lattice + // constexpr int totCellsInRow = N_r + 2; constexpr int totCellsInCol = N_c + 2; -// -// Define total num of cells in a lattice -// + // + // Define total num of cells in a lattice + // constexpr int totCells = totCellsInRow * totCellsInCol; -// _stencil_define_end + // _stencil_define_end -// -// Allocate and initialize lattice -// + // + // Allocate and initialize lattice + // int* input = memoryManager::allocate(totCells * sizeof(int)); int* output = memoryManager::allocate(totCells * sizeof(int)); int* output_ref = memoryManager::allocate(totCells * sizeof(int)); @@ -141,104 +141,100 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(output, 0, totCells * sizeof(int)); std::memset(output_ref, 0, totCells * sizeof(int)); -// -// C-Style intialization -// -// _stencil_input_init_start - for (int row = 1; row <= N_r; ++row) { - for (int col = 1; col <= N_c; ++col) { + // + // C-Style intialization + // + // _stencil_input_init_start + for (int row = 1; row <= N_r; ++row) + { + for (int col = 1; col <= N_c; ++col) + { int id = col + totCellsInCol * row; input[id] = 1; } } -// _stencil_input_init_end + // _stencil_input_init_end - std::cout << "\ninput lattice:\n"; + std::cout << "\ninput lattice:\n"; printLattice(input, totCellsInRow, totCellsInCol); -// -// Generate reference solution -// -// _stencil_output_ref_start - for (int row = 1; row <= N_r; ++row) { - for (int col = 1; col <= N_c; ++col) { + // + // Generate reference solution + // + // _stencil_output_ref_start + for (int row = 1; row <= N_r; ++row) + { + for (int col = 1; col <= N_c; ++col) + { int id = col + totCellsInCol * row; - output_ref[id] = input[id] + input[id + 1] - + input[id - 1] - + input[id + totCellsInCol] - + input[id - totCellsInCol]; + output_ref[id] = input[id] + input[id + 1] + input[id - 1] + + input[id + totCellsInCol] + input[id - totCellsInCol]; } } -// _stencil_output_ref_end + // _stencil_output_ref_end - std::cout << "\noutput reference lattice:\n"; + std::cout << "\noutput reference lattice:\n"; printLattice(output_ref, totCellsInRow, totCellsInCol); -//----------------------------------------------------------------------------// - -// -// The following code illustrates pairing an offset layout and a RAJA view -// object to simplify multidimensional indexing. -// An offset layout is constructed by using the make_offset_layout method. -// The first argument of the layout is an array object with the coordinates of -// the bottom left corner of the lattice, and the second argument is an array -// object of the coordinates of the top right corner plus 1. -// The example uses double braces to initiate the array object and its -// subobjects. -// + //----------------------------------------------------------------------------// + + // + // The following code illustrates pairing an offset layout and a RAJA view + // object to simplify multidimensional indexing. + // An offset layout is constructed by using the make_offset_layout method. 
+ // The first argument of the layout is an array object with the coordinates of + // the bottom left corner of the lattice, and the second argument is an array + // object of the coordinates of the top right corner plus 1. + // The example uses double braces to initiate the array object and its + // subobjects. + // // _offsetlayout_views_start const int DIM = 2; RAJA::OffsetLayout layout = - RAJA::make_offset_layout({{-1, -1}}, {{N_r+1, N_c+1}}); + RAJA::make_offset_layout({{-1, -1}}, {{N_r + 1, N_c + 1}}); RAJA::View> inputView(input, layout); RAJA::View> outputView(output, layout); // _offsetlayout_views_end -// -// Create range segments used in kernels -// + // + // Create range segments used in kernels + // // _offsetlayout_ranges_start RAJA::TypedRangeSegment col_range(0, N_c); RAJA::TypedRangeSegment row_range(0, N_r); // _offsetlayout_ranges_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running five-point stencil (RAJA-Kernel sequential)...\n"; std::memset(output, 0, totCells * sizeof(int)); // _offsetlayout_rajaseq_start - using NESTED_EXEC_POL1 = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, // row - RAJA::statement::For<0, RAJA::seq_exec, // col - RAJA::statement::Lambda<0> - > - > - >; - - RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=](int col, int row) { - - outputView(row, col) = - inputView(row, col) - + inputView(row - 1, col) - + inputView(row + 1, col) - + inputView(row, col - 1) - + inputView(row, col + 1); - - }); + using NESTED_EXEC_POL1 = RAJA::KernelPolicy< + RAJA::statement::For<1, + RAJA::seq_exec, // row + RAJA::statement::For<0, + RAJA::seq_exec, // col + RAJA::statement::Lambda<0>>>>; + + RAJA::kernel( + RAJA::make_tuple(col_range, row_range), [=](int col, int row) { + outputView(row, col) = inputView(row, col) + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); + }); // _offsetlayout_rajaseq_end - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -256,12 +252,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// earlier tutorial section. 
/// - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -270,77 +266,68 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(output, 0, totCells * sizeof(int)); // _offsetlayout_rajacuda_start - using NESTED_EXEC_POL3 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::For<1, RAJA::cuda_block_x_loop, //row - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, //col - RAJA::statement::Lambda<0> - > - > - > - >; + using NESTED_EXEC_POL3 = RAJA::KernelPolicy>>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE(int col, int row) { - outputView(row, col) = - inputView(row, col) - + inputView(row - 1, col) - + inputView(row + 1, col) - + inputView(row, col - 1) - + inputView(row, col + 1); - + inputView(row, col) + + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); }); // _offsetlayout_rajacuda_end - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running five-point stencil (RAJA-Kernel - " "hip)...\n"; - int* d_input = memoryManager::allocate_gpu(totCells * sizeof(int)); + int* d_input = memoryManager::allocate_gpu(totCells * sizeof(int)); int* d_output = memoryManager::allocate_gpu(totCells * sizeof(int)); - hipErrchk(hipMemcpy( d_input, input, totCells * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk( + hipMemcpy(d_input, input, totCells * sizeof(int), hipMemcpyHostToDevice)); - RAJA::View> d_inputView (d_input, layout); + RAJA::View> d_inputView(d_input, layout); RAJA::View> d_outputView(d_output, layout); // _offsetlayout_rajahip_start - using NESTED_EXEC_POL4 = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::For<1, RAJA::hip_block_x_loop, //row - RAJA::statement::For<0, RAJA::hip_thread_x_loop, //col - RAJA::statement::Lambda<0> - > - > - > - >; + using NESTED_EXEC_POL4 = RAJA::KernelPolicy>>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE(int col, int row) { - d_outputView(row, col) = - d_inputView(row, col) - + d_inputView(row - 1, col) - + d_inputView(row + 1, col) - + d_inputView(row, col - 1) - + d_inputView(row, col + 1); + d_inputView(row, col) + + d_inputView(row - 1, col) + + d_inputView(row + 1, col) + + d_inputView(row, col - 1) + + d_inputView(row, col + 1); }); // _offsetlayout_rajahip_end - hipErrchk(hipMemcpy( output, d_output, totCells * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy( + output, d_output, totCells * sizeof(int), hipMemcpyDeviceToHost)); - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); @@ -348,11 +335,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) memoryManager::deallocate_gpu(d_output); #endif 
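// ---------------------------------------------------------------------------
// Illustrative aside (assuming the make_offset_layout behavior described in
// the comments above: begin indices {-1, -1}, end indices {N_r + 1, N_c + 1},
// and the default row-major ordering): an interior access outputView(row, col)
// maps to the flat index
//
//   (row + 1) * totCellsInCol + (col + 1)
//
// For N_r = 5, N_c = 4 (totCellsInCol = 6), outputView(0, 0) therefore touches
// element 7, the first interior cell, which matches id = col + totCellsInCol *
// row with row = col = 1 in the C-style reference kernel.
// ---------------------------------------------------------------------------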
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(input); memoryManager::deallocate(output); memoryManager::deallocate(output_ref); @@ -367,8 +354,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) void printLattice(int* lattice, int totCellsInRow, int totCellsInCol) { std::cout << std::endl; - for (int row = 0; row < totCellsInRow; ++row) { - for (int col = 0; col < totCellsInCol; ++col) { + for (int row = 0; row < totCellsInRow; ++row) + { + for (int col = 0; col < totCellsInCol; ++col) + { const int id = col + totCellsInCol * row; std::cout << lattice[id] << " "; @@ -386,14 +375,18 @@ void checkResult(int* compLattice, int* refLattice, int totCells) bool correct = true; int i = 0; - while ( correct && (i < totCells) ) { + while (correct && (i < totCells)) + { correct = (compLattice[i] == refLattice[i]); i++; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/offset-layout-stencil_solution.cpp b/exercises/offset-layout-stencil_solution.cpp index f212ca7630..814c6128db 100644 --- a/exercises/offset-layout-stencil_solution.cpp +++ b/exercises/offset-layout-stencil_solution.cpp @@ -16,21 +16,21 @@ /* * Offset Layout Stencil Exercise * - * This exercise applies a five-point stencil to the interior cells of a + * This exercise applies a five-point stencil to the interior cells of a * lattice and stores the resulting sums in a second lattice of equal size. - * You can think of the lattice as representing the centers of cells on a - * two-dimensional Cartesian mesh. + * You can think of the lattice as representing the centers of cells on a + * two-dimensional Cartesian mesh. * - * The five-point stencil accumulates values of a cell and its four neighbors. - * Assuming the cells of a lattice may be accessed through a row/col fashion, + * The five-point stencil accumulates values of a cell and its four neighbors. + * Assuming the cells of a lattice may be accessed through a row/col fashion, * the stencil may be expressed as the following sum: - * + * * output(row, col) = input(row, col) + * input(row - 1, col) + input(row + 1, col) + * input(row, col - 1) + input(row, col + 1) * * We assume a lattice has N_r x N_c interior nodes and a padded edge of zeros - * for a lattice of size (N_r + 2) x (N_c + 2). + * for a lattice of size (N_r + 2) x (N_c + 2). * * In the case of N_r = N_c = 3, the input lattice values are: * @@ -60,8 +60,8 @@ * | 0 | 0 | 0 | 0 | 0 | * --------------------- * - * In this exercise, we use RAJA::OffsetLayout and RAJA::View objects to - * simplify the indexing to perform the stencil calculation. For the + * In this exercise, we use RAJA::OffsetLayout and RAJA::View objects to + * simplify the indexing to perform the stencil calculation. For the * purposes of discussion, we enumerate the lattice in the following manner: * * -------------------------------------------------- @@ -81,13 +81,13 @@ * * RAJA features shown: * - RAJA::kernel kernel execution method and execution policies - * - RAJA::View + * - RAJA::View * - RAJA::OffsetLayout * - RAJA::make_offset_layout method * * For the CUDA implementation, we use unified memory to hold the lattice data. * For HIP, we use explicit host-device memory and manually copy data between - * the two. + * the two. 
*/ /* @@ -112,28 +112,28 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nFive-point stencil example...\n"; -// _stencil_define_start -// -// Define num of interior cells in row/cols in a lattice -// + // _stencil_define_start + // + // Define num of interior cells in row/cols in a lattice + // constexpr int N_r = 5; constexpr int N_c = 4; -// -// Define total num of cells in rows/cols in a lattice -// + // + // Define total num of cells in rows/cols in a lattice + // constexpr int totCellsInRow = N_r + 2; constexpr int totCellsInCol = N_c + 2; -// -// Define total num of cells in a lattice -// + // + // Define total num of cells in a lattice + // constexpr int totCells = totCellsInRow * totCellsInCol; -// _stencil_define_end + // _stencil_define_end -// -// Allocate and initialize lattice -// + // + // Allocate and initialize lattice + // int* input = memoryManager::allocate(totCells * sizeof(int)); int* output = memoryManager::allocate(totCells * sizeof(int)); int* output_ref = memoryManager::allocate(totCells * sizeof(int)); @@ -142,104 +142,100 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(output, 0, totCells * sizeof(int)); std::memset(output_ref, 0, totCells * sizeof(int)); -// -// C-Style intialization -// -// _stencil_input_init_start - for (int row = 1; row <= N_r; ++row) { - for (int col = 1; col <= N_c; ++col) { + // + // C-Style intialization + // + // _stencil_input_init_start + for (int row = 1; row <= N_r; ++row) + { + for (int col = 1; col <= N_c; ++col) + { int id = col + totCellsInCol * row; input[id] = 1; } } -// _stencil_input_init_end + // _stencil_input_init_end - std::cout << "\ninput lattice:\n"; + std::cout << "\ninput lattice:\n"; printLattice(input, totCellsInRow, totCellsInCol); -// -// Generate reference solution -// -// _stencil_output_ref_start - for (int row = 1; row <= N_r; ++row) { - for (int col = 1; col <= N_c; ++col) { + // + // Generate reference solution + // + // _stencil_output_ref_start + for (int row = 1; row <= N_r; ++row) + { + for (int col = 1; col <= N_c; ++col) + { int id = col + totCellsInCol * row; - output_ref[id] = input[id] + input[id + 1] - + input[id - 1] - + input[id + totCellsInCol] - + input[id - totCellsInCol]; + output_ref[id] = input[id] + input[id + 1] + input[id - 1] + + input[id + totCellsInCol] + input[id - totCellsInCol]; } } -// _stencil_output_ref_end + // _stencil_output_ref_end - std::cout << "\noutput reference lattice:\n"; + std::cout << "\noutput reference lattice:\n"; printLattice(output_ref, totCellsInRow, totCellsInCol); -//----------------------------------------------------------------------------// - -// -// The following code illustrates pairing an offset layout and a RAJA view -// object to simplify multidimensional indexing. -// An offset layout is constructed by using the make_offset_layout method. -// The first argument of the layout is an array object with the coordinates of -// the bottom left corner of the lattice, and the second argument is an array -// object of the coordinates of the top right corner plus 1. -// The example uses double braces to initiate the array object and its -// subobjects. -// + //----------------------------------------------------------------------------// + + // + // The following code illustrates pairing an offset layout and a RAJA view + // object to simplify multidimensional indexing. + // An offset layout is constructed by using the make_offset_layout method. 
+ // The first argument of the layout is an array object with the coordinates of + // the bottom left corner of the lattice, and the second argument is an array + // object of the coordinates of the top right corner plus 1. + // The example uses double braces to initiate the array object and its + // subobjects. + // // _offsetlayout_views_start const int DIM = 2; RAJA::OffsetLayout layout = - RAJA::make_offset_layout({{-1, -1}}, {{N_r+1, N_c+1}}); + RAJA::make_offset_layout({{-1, -1}}, {{N_r + 1, N_c + 1}}); RAJA::View> inputView(input, layout); RAJA::View> outputView(output, layout); // _offsetlayout_views_end -// -// Create range segments used in kernels -// + // + // Create range segments used in kernels + // // _offsetlayout_ranges_start RAJA::TypedRangeSegment col_range(0, N_c); RAJA::TypedRangeSegment row_range(0, N_r); // _offsetlayout_ranges_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running five-point stencil (RAJA-Kernel sequential)...\n"; std::memset(output, 0, totCells * sizeof(int)); // _offsetlayout_rajaseq_start - using NESTED_EXEC_POL1 = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, // row - RAJA::statement::For<0, RAJA::seq_exec, // col - RAJA::statement::Lambda<0> - > - > - >; - - RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=](int col, int row) { - - outputView(row, col) = - inputView(row, col) - + inputView(row - 1, col) - + inputView(row + 1, col) - + inputView(row, col - 1) - + inputView(row, col + 1); - - }); + using NESTED_EXEC_POL1 = RAJA::KernelPolicy< + RAJA::statement::For<1, + RAJA::seq_exec, // row + RAJA::statement::For<0, + RAJA::seq_exec, // col + RAJA::statement::Lambda<0>>>>; + + RAJA::kernel( + RAJA::make_tuple(col_range, row_range), [=](int col, int row) { + outputView(row, col) = inputView(row, col) + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); + }); // _offsetlayout_rajaseq_end - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -248,33 +244,26 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(output, 0, totCells * sizeof(int)); // _offsetlayout_rajaomp_start - using NESTED_EXEC_POL2 = - RAJA::KernelPolicy< + using NESTED_EXEC_POL2 = RAJA::KernelPolicy< RAJA::statement::Collapse, // row, col - RAJA::statement::Lambda<0> - > - >; - - RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=](int col, int row) { - - outputView(row, col) = - inputView(row, col) - + inputView(row - 1, col) - + inputView(row + 1, col) - + inputView(row, col - 1) - + inputView(row, col + 1); - - }); + RAJA::ArgList<1, 0>, // row, col + RAJA::statement::Lambda<0>>>; + + RAJA::kernel( + RAJA::make_tuple(col_range, row_range), [=](int col, int row) { + outputView(row, col) = inputView(row, col) + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); + }); // _offsetlayout_rajaomp_end - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, 
totCells); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -283,36 +272,30 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(output, 0, totCells * sizeof(int)); // _offsetlayout_rajacuda_start - using NESTED_EXEC_POL3 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::For<1, RAJA::cuda_block_x_loop, //row - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, //col - RAJA::statement::Lambda<0> - > - > - > - >; + using NESTED_EXEC_POL3 = RAJA::KernelPolicy>>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE(int col, int row) { - outputView(row, col) = - inputView(row, col) - + inputView(row - 1, col) - + inputView(row + 1, col) - + inputView(row, col - 1) - + inputView(row, col + 1); - + inputView(row, col) + + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); }); // _offsetlayout_rajacuda_end - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) @@ -321,42 +304,40 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(output, 0, totCells * sizeof(int)); - int* d_input = memoryManager::allocate_gpu(totCells); + int* d_input = memoryManager::allocate_gpu(totCells); int* d_output = memoryManager::allocate_gpu(totCells); - hipErrchk(hipMemcpy( d_input, input, totCells * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_output, output, totCells * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk( + hipMemcpy(d_input, input, totCells * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy( + d_output, output, totCells * sizeof(int), hipMemcpyHostToDevice)); - RAJA::View> d_inputView (d_input, layout); + RAJA::View> d_inputView(d_input, layout); RAJA::View> d_outputView(d_output, layout); // _offsetlayout_rajahip_start - using NESTED_EXEC_POL4 = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::For<1, RAJA::hip_block_x_loop, //row - RAJA::statement::For<0, RAJA::hip_thread_x_loop, //col - RAJA::statement::Lambda<0> - > - > - > - >; + using NESTED_EXEC_POL4 = RAJA::KernelPolicy>>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE(int col, int row) { - d_outputView(row, col) = - d_inputView(row, col) - + d_inputView(row - 1, col) - + d_inputView(row + 1, col) - + d_inputView(row, col - 1) - + d_inputView(row, col + 1); + d_inputView(row, col) + + d_inputView(row - 1, col) + + d_inputView(row + 1, col) + + d_inputView(row, col - 1) + + d_inputView(row, col + 1); }); // _offsetlayout_rajahip_end - hipErrchk(hipMemcpy( output, d_output, totCells * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy( + output, d_output, totCells * sizeof(int), hipMemcpyDeviceToHost)); - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); @@ -364,11 +345,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) memoryManager::deallocate_gpu(d_output); #endif 
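// ---------------------------------------------------------------------------
// Illustrative aside: the OpenMP kernel policy in this file uses
// RAJA::statement::Collapse over ArgList<1, 0> (row, col). A rough C-style
// sketch of that collapsed iteration, shown only for comparison, would be:
//
//   #pragma omp parallel for collapse(2)
//   for (int row = 0; row < N_r; ++row) {
//     for (int col = 0; col < N_c; ++col) {
//       outputView(row, col) = inputView(row, col) + inputView(row - 1, col) +
//                              inputView(row + 1, col) +
//                              inputView(row, col - 1) +
//                              inputView(row, col + 1);
//     }
//   }
// ---------------------------------------------------------------------------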
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(input); memoryManager::deallocate(output); memoryManager::deallocate(output_ref); @@ -383,8 +364,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) void printLattice(int* lattice, int totCellsInRow, int totCellsInCol) { std::cout << std::endl; - for (int row = 0; row < totCellsInRow; ++row) { - for (int col = 0; col < totCellsInCol; ++col) { + for (int row = 0; row < totCellsInRow; ++row) + { + for (int col = 0; col < totCellsInCol; ++col) + { const int id = col + totCellsInCol * row; std::cout << lattice[id] << " "; @@ -402,14 +385,18 @@ void checkResult(int* compLattice, int* refLattice, int totCells) bool correct = true; int i = 0; - while ( correct && (i < totCells) ) { + while (correct && (i < totCells)) + { correct = (compLattice[i] == refLattice[i]); i++; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/permuted-layout-batch-matrix-multiply.cpp b/exercises/permuted-layout-batch-matrix-multiply.cpp index 2fb9d7ac56..b789e63690 100644 --- a/exercises/permuted-layout-batch-matrix-multiply.cpp +++ b/exercises/permuted-layout-batch-matrix-multiply.cpp @@ -75,77 +75,77 @@ constexpr int HIP_BLOCK_SIZE = 256; #endif // -//Function for checking results +// Function for checking results // template void checkResult(T C, int nMat, int nRows, int nCols); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA batched matrix multiplication exercise...\n"; -// Dimensions of matrices + // Dimensions of matrices constexpr int N_c = 3; constexpr int N_r = 3; -// Number of matrices + // Number of matrices constexpr int N = 8000000; -// Number of iterations + // Number of iterations constexpr int NITER = 20; std::cout << "\n Number of matrices to be multiplied: " << N << " \n \n"; -// -// Initialize a RAJA timer object -// and variable to store minimum run time -// + // + // Initialize a RAJA timer object + // and variable to store minimum run time + // auto timer = RAJA::Timer(); double minRun = std::numeric_limits::max(); -// -// Allocate space for data in layout 1 -// - double *A = memoryManager::allocate(N_c * N_r * N); - double *B = memoryManager::allocate(N_c * N_r * N); - double *C = memoryManager::allocate(N_c * N_r * N); - -// -// Layout 1 -// -// make_permuted_layout takes the number of entries in each dimension and a -// templated array indicating index arguments with slowest to fastest stride. -// Standard C++ arrays are used to hold the number of entries in each component. -// This example uses double braces to initalize the array and its subobjects. -// The layout object will index into the array as the following C macro would -// #define Aview(e, r, c) A[c + N_c*(r + N_r*e)]. -// -// RAJA::Layout objects may be templated on dimension, argument type, and -// index with unit stride. Here, the column index has unit stride (argument 2). 
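// ---------------------------------------------------------------------------
// Illustrative aside: a quick numeric check of the layout1 indexing described
// above (perm {0, 1, 2}, extents {N, N_r, N_c}, equivalent to
// A[c + N_c * (r + N_r * e)]). With N_r = N_c = 3, Aview(e = 2, r = 1, c = 2)
// maps to flat index 2 + 3 * (1 + 3 * 2) = 23; consecutive column indices are
// adjacent in memory, i.e., the column index has unit stride as stated.
// ---------------------------------------------------------------------------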
-// + // + // Allocate space for data in layout 1 + // + double* A = memoryManager::allocate(N_c * N_r * N); + double* B = memoryManager::allocate(N_c * N_r * N); + double* C = memoryManager::allocate(N_c * N_r * N); + + // + // Layout 1 + // + // make_permuted_layout takes the number of entries in each dimension and a + // templated array indicating index arguments with slowest to fastest stride. + // Standard C++ arrays are used to hold the number of entries in each + // component. This example uses double braces to initalize the array and its + // subobjects. The layout object will index into the array as the following C + // macro would #define Aview(e, r, c) A[c + N_c*(r + N_r*e)]. + // + // RAJA::Layout objects may be templated on dimension, argument type, and + // index with unit stride. Here, the column index has unit stride (argument + // 2). + // // _permutedlayout_defviews_start - std::array perm1 {{0, 1, 2}}; - auto layout1 = - RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm1 ); + std::array perm1{{0, 1, 2}}; + auto layout1 = RAJA::make_permuted_layout({{N, N_r, N_c}}, perm1); RAJA::View> Aview(A, layout1); RAJA::View> Bview(B, layout1); RAJA::View> Cview(C, layout1); // _permutedlayout_defviews_end -// -// Allocate space for data in layout 2 -// - double *A2 = memoryManager::allocate(N_c * N_r * N); - double *B2 = memoryManager::allocate(N_c * N_r * N); - double *C2 = memoryManager::allocate(N_c * N_r * N); + // + // Allocate space for data in layout 2 + // + double* A2 = memoryManager::allocate(N_c * N_r * N); + double* B2 = memoryManager::allocate(N_c * N_r * N); + double* C2 = memoryManager::allocate(N_c * N_r * N); -// -// Permuted layout - equivalent to indexing using the following macro -// #define Aview2(e, r, c) A2[e + N*(c + N_c*r)] -// In this case the element index has unit stride (argument 0). -// + // + // Permuted layout - equivalent to indexing using the following macro + // #define Aview2(e, r, c) A2[e + N*(c + N_c*r)] + // In this case the element index has unit stride (argument 0). + // /// /// TODO... @@ -158,13 +158,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// Then, create views for the A2, B2, C2 arrays using the /// layout object; i.e., Aview2, Bview2, and Cview2. /// - /// Hint: You will the same indexing to access the array data - /// via the Views as for the Views above which are created + /// Hint: You will the same indexing to access the array data + /// via the Views as for the Views above which are created /// using the layout1 View (see kernels in the code below). /// - /// When you are done with the Views, test them out by + /// When you are done with the Views, test them out by /// uncommenting the kernels in the code below that use the - /// the Aview2, Bview2, and Cview2 views. + /// the Aview2, Bview2, and Cview2 views. 
/// // @@ -180,64 +180,65 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(e, row, col) = row; Bview(e, row, col) = col; Cview(e, row, col) = 0; -// Aview2(e, row, col) = row; -// Bview2(e, row, col) = col; -// Cview2(e, row, col) = 0; + // Aview2(e, row, col) = row; + // Bview2(e, row, col) = col; + // Cview2(e, row, col) = 0; } } }); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 1 (RAJA - sequential) ... " << std::endl; minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); // _permutedlayout_batchedmatmult_loop_start - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](int e) { - - Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) - + Aview(e, 0, 1) * Bview(e, 1, 0) - + Aview(e, 0, 2) * Bview(e, 2, 0); - Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) - + Aview(e, 0, 1) * Bview(e, 1, 1) - + Aview(e, 0, 2) * Bview(e, 2, 1); - Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) - + Aview(e, 0, 1) * Bview(e, 1, 2) - + Aview(e, 0, 2) * Bview(e, 2, 2); - - Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) - + Aview(e, 1, 1) * Bview(e, 1, 0) - + Aview(e, 1, 2) * Bview(e, 2, 0); - Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) - + Aview(e, 1, 1) * Bview(e, 1, 1) - + Aview(e, 1, 2) * Bview(e, 2, 1); - Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) - + Aview(e, 1, 1) * Bview(e, 1, 2) - + Aview(e, 1, 2) * Bview(e, 2, 2); - - Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) - + Aview(e, 2, 1) * Bview(e, 1, 0) - + Aview(e, 2, 2) * Bview(e, 2, 0); - Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) - + Aview(e, 2, 1) * Bview(e, 1, 1) - + Aview(e, 2, 2) * Bview(e, 2, 1); - Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) - + Aview(e, 2, 1) * Bview(e, 1, 2) - + Aview(e, 2, 2) * Bview(e, 2, 2); - } - ); + RAJA::forall( + RAJA::TypedRangeSegment(0, N), [=](int e) { + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + }); // _permutedlayout_batchedmatmult_loop_end timer.stop(); @@ -245,68 +246,68 @@ int main(int RAJA_UNUSED_ARG(argc), char 
**RAJA_UNUSED_ARG(argv[])) if (tMin < minRun) minRun = tMin; timer.reset(); } - + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - sequential) ... " << std::endl; -/* - timer.start(); - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - // _permutedlayout2_batchedmatmult_loop_start - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](int e) { - - Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) - + Aview2(e, 0, 1) * Bview2(e, 1, 0) - + Aview2(e, 0, 2) * Bview2(e, 2, 0); - Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) - + Aview2(e, 0, 1) * Bview2(e, 1, 1) - + Aview2(e, 0, 2) * Bview2(e, 2, 1); - Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) - + Aview2(e, 0, 1) * Bview2(e, 1, 2) - + Aview2(e, 0, 2) * Bview2(e, 2, 2); - - Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) - + Aview2(e, 1, 1) * Bview2(e, 1, 0) - + Aview2(e, 1, 2) * Bview2(e, 2, 0); - Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) - + Aview2(e, 1, 1) * Bview2(e, 1, 1) - + Aview2(e, 1, 2) * Bview2(e, 2, 1); - Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) - + Aview2(e, 1, 1) * Bview2(e, 1, 2) - + Aview2(e, 1, 2) * Bview2(e, 2, 2); - - Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) - + Aview2(e, 2, 1) * Bview2(e, 1, 0) - + Aview2(e, 2, 2) * Bview2(e, 2, 0); - Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) - + Aview2(e, 2, 1) * Bview2(e, 1, 1) - + Aview2(e, 2, 2) * Bview2(e, 2, 1); - Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) - + Aview2(e, 2, 1) * Bview2(e, 1, 2) - + Aview2(e, 2, 2) * Bview2(e, 2, 2); + /* + timer.start(); + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + // _permutedlayout2_batchedmatmult_loop_start + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); - } - ); - // _permutedlayout2_batchedmatmult_loop_end - timer.stop(); + } + ); + // _permutedlayout2_batchedmatmult_loop_end + timer.stop(); - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - std::cout<< "\trun time : "<< minRun << " seconds" << 
std::endl; - checkResult(Cview2, N, N_r, N_c); -*/ + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); + */ -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -316,45 +317,43 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C, 0, N_c * N_r * N * sizeof(double)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); // _permutedlayout_batchedmatmult_omp_start - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](int e) { - - Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) - + Aview(e, 0, 1) * Bview(e, 1, 0) - + Aview(e, 0, 2) * Bview(e, 2, 0); - Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) - + Aview(e, 0, 1) * Bview(e, 1, 1) - + Aview(e, 0, 2) * Bview(e, 2, 1); - Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) - + Aview(e, 0, 1) * Bview(e, 1, 2) - + Aview(e, 0, 2) * Bview(e, 2, 2); - - Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) - + Aview(e, 1, 1) * Bview(e, 1, 0) - + Aview(e, 1, 2) * Bview(e, 2, 0); - Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) - + Aview(e, 1, 1) * Bview(e, 1, 1) - + Aview(e, 1, 2) * Bview(e, 2, 1); - Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) - + Aview(e, 1, 1) * Bview(e, 1, 2) - + Aview(e, 1, 2) * Bview(e, 2, 2); - - Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) - + Aview(e, 2, 1) * Bview(e, 1, 0) - + Aview(e, 2, 2) * Bview(e, 2, 0); - Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) - + Aview(e, 2, 1) * Bview(e, 1, 1) - + Aview(e, 2, 2) * Bview(e, 2, 1); - Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) - + Aview(e, 2, 1) * Bview(e, 1, 2) - + Aview(e, 2, 2) * Bview(e, 2, 2); - - } - ); + RAJA::forall( + RAJA::TypedRangeSegment(0, N), [=](int e) { + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + }); // _permutedlayout_batchedmatmult_omp_end timer.stop(); @@ -362,71 +361,72 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) if (tMin < minRun) minRun = tMin; timer.reset(); } - - std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); 
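Every variant in this exercise is timed the same way: the kernel is run NITER times and only the fastest run is reported, which filters out one-time overheads such as first-touch page faults and warm-up costs. Below is a minimal sketch of that idiom factored into a helper; the helper name bestOf and its callable parameter are illustrative, not part of the RAJA API.

#include <limits>
#include "RAJA/RAJA.hpp"

// Run fn() 'niter' times and return the fastest elapsed time, mirroring the
// timer/minRun pattern used for every kernel variant in this file.
template <typename Func>
double bestOf(int niter, Func&& fn)
{
  auto timer = RAJA::Timer();
  double minRun = std::numeric_limits<double>::max();
  for (int i = 0; i < niter; ++i)
  {
    timer.start();
    fn();
    timer.stop();
    RAJA::Timer::ElapsedType t = timer.elapsed();
    if (t < minRun) minRun = t;
    timer.reset();
  }
  return minRun;
}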
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - omp parallel for) ... " << std::endl; std::memset(C2, 0, N_c * N_r * N * sizeof(double)); -/* - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](int e) { - - Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) - + Aview2(e, 0, 1) * Bview2(e, 1, 0) - + Aview2(e, 0, 2) * Bview2(e, 2, 0); - Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) - + Aview2(e, 0, 1) * Bview2(e, 1, 1) - + Aview2(e, 0, 2) * Bview2(e, 2, 1); - Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) - + Aview2(e, 0, 1) * Bview2(e, 1, 2) - + Aview2(e, 0, 2) * Bview2(e, 2, 2); - - Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) - + Aview2(e, 1, 1) * Bview2(e, 1, 0) - + Aview2(e, 1, 2) * Bview2(e, 2, 0); - Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) - + Aview2(e, 1, 1) * Bview2(e, 1, 1) - + Aview2(e, 1, 2) * Bview2(e, 2, 1); - Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) - + Aview2(e, 1, 1) * Bview2(e, 1, 2) - + Aview2(e, 1, 2) * Bview2(e, 2, 2); + /* + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall(RAJA::TypedRangeSegment(0, + N), + [=](int e) { + + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); - Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) - + Aview2(e, 2, 1) * Bview2(e, 1, 0) - + Aview2(e, 2, 2) * Bview2(e, 2, 0); - Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) - + Aview2(e, 2, 1) * Bview2(e, 1, 1) - + Aview2(e, 2, 2) * Bview2(e, 2, 1); - Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) - + Aview2(e, 2, 1) * Bview2(e, 1, 2) - + Aview2(e, 2, 2) * Bview2(e, 2, 2); - - } - ); - timer.stop(); + } + ); + timer.stop(); - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - std::cout<< "\trun time : " << minRun << " seconds" << std::endl; - checkResult(Cview2, N, N_r, N_c); -*/ + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); + */ #endif 
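The TODO earlier in this exercise file asks for the layout 2 objects. One possible way to fill it in is sketched below; it mirrors the permuted-layout solution file later in this patch, where permutation {1, 2, 0} makes the element index e (argument 0) the unit-stride index, so Aview2(e, r, c) addresses A2[e + N * (c + N_c * r)]. The RAJA::idx_t array type and the Layout<3, int, 0> view template arguments are assumptions inferred from the layout 1 views above.

// Permuted layout for layout 2: the element index e becomes the fastest index.
std::array<RAJA::idx_t, 3> perm2{{1, 2, 0}};
auto layout2 = RAJA::make_permuted_layout({{N, N_r, N_c}}, perm2);

// Views over the layout 2 arrays; argument 0 (e) has unit stride.
RAJA::View<double, RAJA::Layout<3, int, 0>> Aview2(A2, layout2);
RAJA::View<double, RAJA::Layout<3, int, 0>> Bview2(B2, layout2);
RAJA::View<double, RAJA::Layout<3, int, 0>> Cview2(C2, layout2);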
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -436,44 +436,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C, 0, N_c * N_r * N * sizeof(double)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); - RAJA::forall>(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE(int e) { - - Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) - + Aview(e, 0, 1) * Bview(e, 1, 0) - + Aview(e, 0, 2) * Bview(e, 2, 0); - Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) - + Aview(e, 0, 1) * Bview(e, 1, 1) - + Aview(e, 0, 2) * Bview(e, 2, 1); - Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) - + Aview(e, 0, 1) * Bview(e, 1, 2) - + Aview(e, 0, 2) * Bview(e, 2, 2); - - Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) - + Aview(e, 1, 1) * Bview(e, 1, 0) - + Aview(e, 1, 2) * Bview(e, 2, 0); - Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) - + Aview(e, 1, 1) * Bview(e, 1, 1) - + Aview(e, 1, 2) * Bview(e, 2, 1); - Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) - + Aview(e, 1, 1) * Bview(e, 1, 2) - + Aview(e, 1, 2) * Bview(e, 2, 2); - - Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) - + Aview(e, 2, 1) * Bview(e, 1, 0) - + Aview(e, 2, 2) * Bview(e, 2, 0); - Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) - + Aview(e, 2, 1) * Bview(e, 1, 1) - + Aview(e, 2, 2) * Bview(e, 2, 1); - Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) - + Aview(e, 2, 1) * Bview(e, 1, 2) - + Aview(e, 2, 2) * Bview(e, 2, 2); - - } - ); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + }); timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); @@ -481,10 +479,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.reset(); } - std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + std::cout << "\trun time: " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - cuda) ... 
" << std::endl; @@ -496,7 +494,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int i = 0; i < NITER; ++i) { timer.start(); - RAJA::forall>(RAJA::TypedRangeSegment(0, N), + RAJA::forall>(RAJA::TypedRangeSegment(0, + N), [=] RAJA_DEVICE(int e) { Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) @@ -542,63 +541,63 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) */ #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << " \n Running batched matrix multiplication" << " with layout 1 (RAJA - hip) ... " << std::endl; - double *d_A = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_B = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_C = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_A = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_B = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_C = memoryManager::allocate_gpu(N_c * N_r * N); RAJA::View> d_Aview(d_A, layout1); RAJA::View> d_Bview(d_B, layout1); RAJA::View> d_Cview(d_C, layout1); - hipErrchk(hipMemcpy( d_A, A, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B, B, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk( + hipMemcpy(d_A, A, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_B, B, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); - RAJA::forall>(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE(int e) { - - d_Cview(e, 0, 0) = d_Aview(e, 0, 0) * d_Bview(e, 0, 0) - + d_Aview(e, 0, 1) * d_Bview(e, 1, 0) - + d_Aview(e, 0, 2) * d_Bview(e, 2, 0); - d_Cview(e, 0, 1) = d_Aview(e, 0, 0) * d_Bview(e, 0, 1) - + d_Aview(e, 0, 1) * d_Bview(e, 1, 1) - + d_Aview(e, 0, 2) * d_Bview(e, 2, 1); - d_Cview(e, 0, 2) = d_Aview(e, 0, 0) * d_Bview(e, 0, 2) - + d_Aview(e, 0, 1) * d_Bview(e, 1, 2) - + d_Aview(e, 0, 2) * d_Bview(e, 2, 2); - - d_Cview(e, 1, 0) = d_Aview(e, 1, 0) * d_Bview(e, 0, 0) - + d_Aview(e, 1, 1) * d_Bview(e, 1, 0) - + d_Aview(e, 1, 2) * d_Bview(e, 2, 0); - d_Cview(e, 1, 1) = d_Aview(e, 1, 0) * d_Bview(e, 0, 1) - + d_Aview(e, 1, 1) * d_Bview(e, 1, 1) - + d_Aview(e, 1, 2) * d_Bview(e, 2, 1); - d_Cview(e, 1, 2) = d_Aview(e, 1, 0) * d_Bview(e, 0, 2) - + d_Aview(e, 1, 1) * d_Bview(e, 1, 2) - + d_Aview(e, 1, 2) * d_Bview(e, 2, 2); - - d_Cview(e, 2, 0) = d_Aview(e, 2, 0) * d_Bview(e, 0, 0) - + d_Aview(e, 2, 1) * d_Bview(e, 1, 0) - + d_Aview(e, 2, 2) * d_Bview(e, 2, 0); - d_Cview(e, 2, 1) = d_Aview(e, 2, 0) * d_Bview(e, 0, 1) - + d_Aview(e, 2, 1) * d_Bview(e, 1, 1) - + d_Aview(e, 2, 2) * d_Bview(e, 2, 1); - d_Cview(e, 2, 2) = d_Aview(e, 2, 0) * d_Bview(e, 0, 2) - + d_Aview(e, 2, 1) * d_Bview(e, 1, 2) - + d_Aview(e, 2, 2) * d_Bview(e, 2, 2); - - } - ); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { + d_Cview(e, 0, 0) = d_Aview(e, 0, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 0); + d_Cview(e, 0, 1) = d_Aview(e, 0, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 1); + d_Cview(e, 0, 2) = d_Aview(e, 0, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 2); + + d_Cview(e, 1, 0) = d_Aview(e, 1, 0) * d_Bview(e, 
0, 0) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 0); + d_Cview(e, 1, 1) = d_Aview(e, 1, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 1); + d_Cview(e, 1, 2) = d_Aview(e, 1, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 2); + + d_Cview(e, 2, 0) = d_Aview(e, 2, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 0); + d_Cview(e, 2, 1) = d_Aview(e, 2, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 1); + d_Cview(e, 2, 2) = d_Aview(e, 2, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 2); + }); timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); @@ -606,19 +605,20 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.reset(); } - hipErrchk(hipMemcpy( C, d_C, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(C, d_C, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost)); - std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + std::cout << "\trun time: " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate_gpu(d_A); memoryManager::deallocate_gpu(d_B); memoryManager::deallocate_gpu(d_C); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - hip) ... " << std::endl; @@ -632,14 +632,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Bview2(d_B2, layout2); RAJA::View> d_Cview2(d_C2, layout2); - hipErrchk(hipMemcpy( d_A2, A2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B2, B2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_A2, A2, N_c * N_r * N * sizeof(double), +hipMemcpyHostToDevice )); hipErrchk(hipMemcpy( d_B2, B2, N_c * N_r * N * +sizeof(double), hipMemcpyHostToDevice )); minRun = std::numeric_limits::max(); for (int i = 0; i < NITER; ++i) { timer.start(); - RAJA::forall>(RAJA::TypedRangeSegment(0, N), + RAJA::forall>(RAJA::TypedRangeSegment(0, +N), [=] RAJA_DEVICE(int e) { d_Cview2(e, 0, 0) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 0) @@ -681,7 +683,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.reset(); } - hipErrchk(hipMemcpy( C2, d_C2, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy( C2, d_C2, N_c * N_r * N * sizeof(double), +hipMemcpyDeviceToHost )); std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; checkResult(Cview2, N, N_r, N_c); @@ -695,11 +698,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) */ #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(A); memoryManager::deallocate(B); memoryManager::deallocate(C); @@ -719,19 +722,26 @@ void checkResult(T C, int nMat, int nRows, int nCols) { bool status = true; - for (int e = 0; e < nMat; ++e) { - for (int row = 0; row < nRows; ++row) { - for (int col = 0; col < nCols; ++col) { - if (std::abs(C(e, row, col) - row * col * nCols) > 10e-12) { + for (int e = 0; e < nMat; ++e) + { + for (int row = 0; row < nRows; ++row) + { + for (int col = 0; col < nCols; ++col) + { + if (std::abs(C(e, row, col) - row * col * nCols) > 10e-12) + { status = false; } } } } - if ( status ) { + if (status) + { std::cout << "\tresult -- PASS\n"; - } else { + } + else + { std::cout << "\tresult -- FAIL\n"; } } diff --git a/exercises/permuted-layout-batch-matrix-multiply_solution.cpp b/exercises/permuted-layout-batch-matrix-multiply_solution.cpp index 297ec45047..05b393ef2b 100644 --- a/exercises/permuted-layout-batch-matrix-multiply_solution.cpp +++ b/exercises/permuted-layout-batch-matrix-multiply_solution.cpp @@ -76,81 +76,80 @@ constexpr int HIP_BLOCK_SIZE = 256; #endif // -//Function for checking results +// Function for checking results // template void checkResult(T C, int nMat, int nRows, int nCols); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA batched matrix multiplication exercise...\n"; -// Dimensions of matrices + // Dimensions of matrices constexpr int N_c = 3; constexpr int N_r = 3; -// Number of matrices + // Number of matrices constexpr int N = 8000000; -// Number of iterations + // Number of iterations constexpr int NITER = 20; std::cout << "\n Number of matrices to be multiplied: " << N << " \n \n"; -// -// Initialize a RAJA timer object -// and variable to store minimum run time -// + // + // Initialize a RAJA timer object + // and variable to store minimum run time + // auto timer = RAJA::Timer(); double minRun = std::numeric_limits::max(); -// -// Allocate space for data in layout 1 -// - double *A = memoryManager::allocate(N_c * N_r * N); - double *B = memoryManager::allocate(N_c * N_r * N); - double *C = memoryManager::allocate(N_c * N_r * N); - -// -// Layout 1 -// -// make_permuted_layout takes the number of entries in each dimension and a -// templated array indicating index arguments with slowest to fastest stride. -// Standard C++ arrays are used to hold the number of entries in each component. -// This example uses double braces to initalize the array and its subobjects. -// The layout object will index into the array as the following C macro would -// #define Aview(e, r, c) A[c + N_c*(r + N_r*e)]. -// -// RAJA::Layout objects may be templated on dimension, argument type, and -// index with unit stride. Here, the column index has unit stride (argument 2). -// + // + // Allocate space for data in layout 1 + // + double* A = memoryManager::allocate(N_c * N_r * N); + double* B = memoryManager::allocate(N_c * N_r * N); + double* C = memoryManager::allocate(N_c * N_r * N); + + // + // Layout 1 + // + // make_permuted_layout takes the number of entries in each dimension and a + // templated array indicating index arguments with slowest to fastest stride. + // Standard C++ arrays are used to hold the number of entries in each + // component. This example uses double braces to initalize the array and its + // subobjects. The layout object will index into the array as the following C + // macro would #define Aview(e, r, c) A[c + N_c*(r + N_r*e)]. 
+ // + // RAJA::Layout objects may be templated on dimension, argument type, and + // index with unit stride. Here, the column index has unit stride (argument + // 2). + // // _permutedlayout_defviews_start - std::array perm1 {{0, 1, 2}}; - auto layout1 = - RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm1 ); + std::array perm1{{0, 1, 2}}; + auto layout1 = RAJA::make_permuted_layout({{N, N_r, N_c}}, perm1); RAJA::View> Aview(A, layout1); RAJA::View> Bview(B, layout1); RAJA::View> Cview(C, layout1); // _permutedlayout_defviews_end -// -// Allocate space for data in layout 2 -// - double *A2 = memoryManager::allocate(N_c * N_r * N); - double *B2 = memoryManager::allocate(N_c * N_r * N); - double *C2 = memoryManager::allocate(N_c * N_r * N); - -// -// Permuted layout - equivalent to indexing using the following macro -// #define Aview2(e, r, c) A2[e + N*(c + N_c*r)] -// In this case the element index has unit stride (argument 0). -// + // + // Allocate space for data in layout 2 + // + double* A2 = memoryManager::allocate(N_c * N_r * N); + double* B2 = memoryManager::allocate(N_c * N_r * N); + double* C2 = memoryManager::allocate(N_c * N_r * N); + + // + // Permuted layout - equivalent to indexing using the following macro + // #define Aview2(e, r, c) A2[e + N*(c + N_c*r)] + // In this case the element index has unit stride (argument 0). + // // _permutedlayout_permviews_start - std::array perm2 {{1, 2, 0}}; - auto layout2 = - RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm2 ); + std::array perm2{{1, 2, 0}}; + auto layout2 = RAJA::make_permuted_layout({{N, N_r, N_c}}, perm2); RAJA::View> Aview2(A2, layout2); RAJA::View> Bview2(B2, layout2); @@ -170,8 +169,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(e, row, col) = row; Bview(e, row, col) = col; Cview(e, row, col) = 0; @@ -184,50 +185,49 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 1 (RAJA - sequential) ... 
" << std::endl; minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); // _permutedlayout_batchedmatmult_loop_start - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](int e) { - - Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) - + Aview(e, 0, 1) * Bview(e, 1, 0) - + Aview(e, 0, 2) * Bview(e, 2, 0); - Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) - + Aview(e, 0, 1) * Bview(e, 1, 1) - + Aview(e, 0, 2) * Bview(e, 2, 1); - Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) - + Aview(e, 0, 1) * Bview(e, 1, 2) - + Aview(e, 0, 2) * Bview(e, 2, 2); - - Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) - + Aview(e, 1, 1) * Bview(e, 1, 0) - + Aview(e, 1, 2) * Bview(e, 2, 0); - Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) - + Aview(e, 1, 1) * Bview(e, 1, 1) - + Aview(e, 1, 2) * Bview(e, 2, 1); - Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) - + Aview(e, 1, 1) * Bview(e, 1, 2) - + Aview(e, 1, 2) * Bview(e, 2, 2); - - Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) - + Aview(e, 2, 1) * Bview(e, 1, 0) - + Aview(e, 2, 2) * Bview(e, 2, 0); - Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) - + Aview(e, 2, 1) * Bview(e, 1, 1) - + Aview(e, 2, 2) * Bview(e, 2, 1); - Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) - + Aview(e, 2, 1) * Bview(e, 1, 2) - + Aview(e, 2, 2) * Bview(e, 2, 2); - } - ); + RAJA::forall( + RAJA::TypedRangeSegment(0, N), [=](int e) { + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + }); // _permutedlayout_batchedmatmult_loop_end timer.stop(); @@ -235,55 +235,53 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) if (tMin < minRun) minRun = tMin; timer.reset(); } - + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - sequential) ... 
" << std::endl; minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); // _permutedlayout2_batchedmatmult_loop_start - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](int e) { - - Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) - + Aview2(e, 0, 1) * Bview2(e, 1, 0) - + Aview2(e, 0, 2) * Bview2(e, 2, 0); - Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) - + Aview2(e, 0, 1) * Bview2(e, 1, 1) - + Aview2(e, 0, 2) * Bview2(e, 2, 1); - Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) - + Aview2(e, 0, 1) * Bview2(e, 1, 2) - + Aview2(e, 0, 2) * Bview2(e, 2, 2); - - Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) - + Aview2(e, 1, 1) * Bview2(e, 1, 0) - + Aview2(e, 1, 2) * Bview2(e, 2, 0); - Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) - + Aview2(e, 1, 1) * Bview2(e, 1, 1) - + Aview2(e, 1, 2) * Bview2(e, 2, 1); - Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) - + Aview2(e, 1, 1) * Bview2(e, 1, 2) - + Aview2(e, 1, 2) * Bview2(e, 2, 2); - - Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) - + Aview2(e, 2, 1) * Bview2(e, 1, 0) - + Aview2(e, 2, 2) * Bview2(e, 2, 0); - Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) - + Aview2(e, 2, 1) * Bview2(e, 1, 1) - + Aview2(e, 2, 2) * Bview2(e, 2, 1); - Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) - + Aview2(e, 2, 1) * Bview2(e, 1, 2) - + Aview2(e, 2, 2) * Bview2(e, 2, 2); - - } - ); + RAJA::forall( + RAJA::TypedRangeSegment(0, N), [=](int e) { + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); + }); // _permutedlayout2_batchedmatmult_loop_end timer.stop(); @@ -291,10 +289,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) if (tMin < minRun) minRun = tMin; timer.reset(); } - std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview2, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -304,45 +302,43 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C, 0, N_c * N_r * N * sizeof(double)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); // 
_permutedlayout_batchedmatmult_omp_start - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](int e) { - - Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) - + Aview(e, 0, 1) * Bview(e, 1, 0) - + Aview(e, 0, 2) * Bview(e, 2, 0); - Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) - + Aview(e, 0, 1) * Bview(e, 1, 1) - + Aview(e, 0, 2) * Bview(e, 2, 1); - Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) - + Aview(e, 0, 1) * Bview(e, 1, 2) - + Aview(e, 0, 2) * Bview(e, 2, 2); - - Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) - + Aview(e, 1, 1) * Bview(e, 1, 0) - + Aview(e, 1, 2) * Bview(e, 2, 0); - Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) - + Aview(e, 1, 1) * Bview(e, 1, 1) - + Aview(e, 1, 2) * Bview(e, 2, 1); - Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) - + Aview(e, 1, 1) * Bview(e, 1, 2) - + Aview(e, 1, 2) * Bview(e, 2, 2); - - Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) - + Aview(e, 2, 1) * Bview(e, 1, 0) - + Aview(e, 2, 2) * Bview(e, 2, 0); - Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) - + Aview(e, 2, 1) * Bview(e, 1, 1) - + Aview(e, 2, 2) * Bview(e, 2, 1); - Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) - + Aview(e, 2, 1) * Bview(e, 1, 2) - + Aview(e, 2, 2) * Bview(e, 2, 2); - - } - ); + RAJA::forall( + RAJA::TypedRangeSegment(0, N), [=](int e) { + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + }); // _permutedlayout_batchedmatmult_omp_end timer.stop(); @@ -350,11 +346,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) if (tMin < minRun) minRun = tMin; timer.reset(); } - - std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - omp parallel for) ... 
" << std::endl; @@ -362,57 +358,55 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C2, 0, N_c * N_r * N * sizeof(double)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](int e) { - - Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) - + Aview2(e, 0, 1) * Bview2(e, 1, 0) - + Aview2(e, 0, 2) * Bview2(e, 2, 0); - Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) - + Aview2(e, 0, 1) * Bview2(e, 1, 1) - + Aview2(e, 0, 2) * Bview2(e, 2, 1); - Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) - + Aview2(e, 0, 1) * Bview2(e, 1, 2) - + Aview2(e, 0, 2) * Bview2(e, 2, 2); - - Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) - + Aview2(e, 1, 1) * Bview2(e, 1, 0) - + Aview2(e, 1, 2) * Bview2(e, 2, 0); - Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) - + Aview2(e, 1, 1) * Bview2(e, 1, 1) - + Aview2(e, 1, 2) * Bview2(e, 2, 1); - Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) - + Aview2(e, 1, 1) * Bview2(e, 1, 2) - + Aview2(e, 1, 2) * Bview2(e, 2, 2); - - Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) - + Aview2(e, 2, 1) * Bview2(e, 1, 0) - + Aview2(e, 2, 2) * Bview2(e, 2, 0); - Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) - + Aview2(e, 2, 1) * Bview2(e, 1, 1) - + Aview2(e, 2, 2) * Bview2(e, 2, 1); - Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) - + Aview2(e, 2, 1) * Bview2(e, 1, 2) - + Aview2(e, 2, 2) * Bview2(e, 2, 2); - - } - ); + RAJA::forall( + RAJA::TypedRangeSegment(0, N), [=](int e) { + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); + }); timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; timer.reset(); } - std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview2, N, N_r, N_c); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -422,44 +416,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C, 0, N_c * N_r * N * sizeof(double)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); - 
RAJA::forall>(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE(int e) { - - Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) - + Aview(e, 0, 1) * Bview(e, 1, 0) - + Aview(e, 0, 2) * Bview(e, 2, 0); - Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) - + Aview(e, 0, 1) * Bview(e, 1, 1) - + Aview(e, 0, 2) * Bview(e, 2, 1); - Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) - + Aview(e, 0, 1) * Bview(e, 1, 2) - + Aview(e, 0, 2) * Bview(e, 2, 2); - - Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) - + Aview(e, 1, 1) * Bview(e, 1, 0) - + Aview(e, 1, 2) * Bview(e, 2, 0); - Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) - + Aview(e, 1, 1) * Bview(e, 1, 1) - + Aview(e, 1, 2) * Bview(e, 2, 1); - Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) - + Aview(e, 1, 1) * Bview(e, 1, 2) - + Aview(e, 1, 2) * Bview(e, 2, 2); - - Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) - + Aview(e, 2, 1) * Bview(e, 1, 0) - + Aview(e, 2, 2) * Bview(e, 2, 0); - Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) - + Aview(e, 2, 1) * Bview(e, 1, 1) - + Aview(e, 2, 2) * Bview(e, 2, 1); - Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) - + Aview(e, 2, 1) * Bview(e, 1, 2) - + Aview(e, 2, 2) * Bview(e, 2, 2); - - } - ); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + }); timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); @@ -467,10 +459,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.reset(); } - std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + std::cout << "\trun time: " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - cuda) ... 
" << std::endl; @@ -478,68 +470,66 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C2, 0, N_c * N_r * N * sizeof(double)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); - RAJA::forall>(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE(int e) { - - Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) - + Aview2(e, 0, 1) * Bview2(e, 1, 0) - + Aview2(e, 0, 2) * Bview2(e, 2, 0); - Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) - + Aview2(e, 0, 1) * Bview2(e, 1, 1) - + Aview2(e, 0, 2) * Bview2(e, 2, 1); - Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) - + Aview2(e, 0, 1) * Bview2(e, 1, 2) - + Aview2(e, 0, 2) * Bview2(e, 2, 2); - - Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) - + Aview2(e, 1, 1) * Bview2(e, 1, 0) - + Aview2(e, 1, 2) * Bview2(e, 2, 0); - Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) - + Aview2(e, 1, 1) * Bview2(e, 1, 1) - + Aview2(e, 1, 2) * Bview2(e, 2, 1); - Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) - + Aview2(e, 1, 1) * Bview2(e, 1, 2) - + Aview2(e, 1, 2) * Bview2(e, 2, 2); - - Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) - + Aview2(e, 2, 1) * Bview2(e, 1, 0) - + Aview2(e, 2, 2) * Bview2(e, 2, 0); - Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) - + Aview2(e, 2, 1) * Bview2(e, 1, 1) - + Aview2(e, 2, 2) * Bview2(e, 2, 1); - Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) - + Aview2(e, 2, 1) * Bview2(e, 1, 2) - + Aview2(e, 2, 2) * Bview2(e, 2, 2); - - } - ); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); + }); timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; timer.reset(); } - std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview2, N, N_r, N_c); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << " \n Running batched matrix multiplication" << " with layout 1 (RAJA - hip) ... 
" << std::endl; - double *d_A = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_B = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_C = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_A = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_B = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_C = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_A2 = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_B2 = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_C2 = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_A2 = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_B2 = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_C2 = memoryManager::allocate_gpu(N_c * N_r * N); RAJA::View> d_Aview(d_A, layout1); RAJA::View> d_Bview(d_B, layout1); @@ -549,50 +539,52 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Bview2(d_B2, layout2); RAJA::View> d_Cview2(d_C2, layout2); - hipErrchk(hipMemcpy( d_A, A, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B, B, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_A2, A2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B2, B2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk( + hipMemcpy(d_A, A, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_B, B, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy( + d_A2, A2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy( + d_B2, B2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); - RAJA::forall>(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE(int e) { - - d_Cview(e, 0, 0) = d_Aview(e, 0, 0) * d_Bview(e, 0, 0) - + d_Aview(e, 0, 1) * d_Bview(e, 1, 0) - + d_Aview(e, 0, 2) * d_Bview(e, 2, 0); - d_Cview(e, 0, 1) = d_Aview(e, 0, 0) * d_Bview(e, 0, 1) - + d_Aview(e, 0, 1) * d_Bview(e, 1, 1) - + d_Aview(e, 0, 2) * d_Bview(e, 2, 1); - d_Cview(e, 0, 2) = d_Aview(e, 0, 0) * d_Bview(e, 0, 2) - + d_Aview(e, 0, 1) * d_Bview(e, 1, 2) - + d_Aview(e, 0, 2) * d_Bview(e, 2, 2); - - d_Cview(e, 1, 0) = d_Aview(e, 1, 0) * d_Bview(e, 0, 0) - + d_Aview(e, 1, 1) * d_Bview(e, 1, 0) - + d_Aview(e, 1, 2) * d_Bview(e, 2, 0); - d_Cview(e, 1, 1) = d_Aview(e, 1, 0) * d_Bview(e, 0, 1) - + d_Aview(e, 1, 1) * d_Bview(e, 1, 1) - + d_Aview(e, 1, 2) * d_Bview(e, 2, 1); - d_Cview(e, 1, 2) = d_Aview(e, 1, 0) * d_Bview(e, 0, 2) - + d_Aview(e, 1, 1) * d_Bview(e, 1, 2) - + d_Aview(e, 1, 2) * d_Bview(e, 2, 2); - - d_Cview(e, 2, 0) = d_Aview(e, 2, 0) * d_Bview(e, 0, 0) - + d_Aview(e, 2, 1) * d_Bview(e, 1, 0) - + d_Aview(e, 2, 2) * d_Bview(e, 2, 0); - d_Cview(e, 2, 1) = d_Aview(e, 2, 0) * d_Bview(e, 0, 1) - + d_Aview(e, 2, 1) * d_Bview(e, 1, 1) - + d_Aview(e, 2, 2) * d_Bview(e, 2, 1); - d_Cview(e, 2, 2) = d_Aview(e, 2, 0) * d_Bview(e, 0, 2) - + d_Aview(e, 2, 1) * d_Bview(e, 1, 2) - + d_Aview(e, 2, 2) * d_Bview(e, 2, 2); - - } - ); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { + d_Cview(e, 0, 0) = d_Aview(e, 0, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 0); + d_Cview(e, 0, 1) = d_Aview(e, 0, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 1); + d_Cview(e, 0, 2) = d_Aview(e, 0, 0) * 
d_Bview(e, 0, 2) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 2); + + d_Cview(e, 1, 0) = d_Aview(e, 1, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 0); + d_Cview(e, 1, 1) = d_Aview(e, 1, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 1); + d_Cview(e, 1, 2) = d_Aview(e, 1, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 2); + + d_Cview(e, 2, 0) = d_Aview(e, 2, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 0); + d_Cview(e, 2, 1) = d_Aview(e, 2, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 1); + d_Cview(e, 2, 2) = d_Aview(e, 2, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 2); + }); timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); @@ -600,55 +592,54 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.reset(); } - hipErrchk(hipMemcpy( C, d_C, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(C, d_C, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost)); - std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + std::cout << "\trun time: " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - hip) ... " << std::endl; minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); - RAJA::forall>(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE(int e) { - - d_Cview2(e, 0, 0) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 0) - + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 0) - + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 0); - d_Cview2(e, 0, 1) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 1) - + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 1) - + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 1); - d_Cview2(e, 0, 2) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 2) - + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 2) - + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 2); - - d_Cview2(e, 1, 0) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 0) - + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 0) - + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 0); - d_Cview2(e, 1, 1) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 1) - + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 1) - + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 1); - d_Cview2(e, 1, 2) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 2) - + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 2) - + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 2); - - d_Cview2(e, 2, 0) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 0) - + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 0) - + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 0); - d_Cview2(e, 2, 1) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 1) - + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 1) - + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 1); - d_Cview2(e, 2, 2) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 2) - + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 2) - + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 2); - - } - ); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { + d_Cview2(e, 0, 0) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 0) + + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 0) + + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 0); + d_Cview2(e, 0, 1) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 1) + + d_Aview2(e, 0, 1) * 
d_Bview2(e, 1, 1) + + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 1); + d_Cview2(e, 0, 2) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 2) + + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 2) + + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 2); + + d_Cview2(e, 1, 0) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 0) + + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 0) + + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 0); + d_Cview2(e, 1, 1) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 1) + + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 1) + + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 1); + d_Cview2(e, 1, 2) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 2) + + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 2) + + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 2); + + d_Cview2(e, 2, 0) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 0) + + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 0) + + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 0); + d_Cview2(e, 2, 1) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 1) + + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 1) + + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 1); + d_Cview2(e, 2, 2) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 2) + + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 2) + + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 2); + }); timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); @@ -656,9 +647,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.reset(); } - hipErrchk(hipMemcpy( C2, d_C2, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy( + C2, d_C2, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost)); - std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview2, N, N_r, N_c); memoryManager::deallocate_gpu(d_A); @@ -669,11 +661,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) memoryManager::deallocate_gpu(d_C2); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(A); memoryManager::deallocate(B); memoryManager::deallocate(C); @@ -693,19 +685,26 @@ void checkResult(T C, int nMat, int nRows, int nCols) { bool status = true; - for (int e = 0; e < nMat; ++e) { - for (int row = 0; row < nRows; ++row) { - for (int col = 0; col < nCols; ++col) { - if (std::abs(C(e, row, col) - row * col * nCols) > 10e-12) { + for (int e = 0; e < nMat; ++e) + { + for (int row = 0; row < nRows; ++row) + { + for (int col = 0; col < nCols; ++col) + { + if (std::abs(C(e, row, col) - row * col * nCols) > 10e-12) + { status = false; } } } } - if ( status ) { + if (status) + { std::cout << "\tresult -- PASS\n"; - } else { + } + else + { std::cout << "\tresult -- FAIL\n"; } } diff --git a/exercises/reductions.cpp b/exercises/reductions.cpp index 4c6b90c063..e4752861de 100644 --- a/exercises/reductions.cpp +++ b/exercises/reductions.cpp @@ -32,7 +32,7 @@ Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -//constexpr int CUDA_BLOCK_SIZE = 256; +// constexpr int CUDA_BLOCK_SIZE = 256; #endif #if defined(RAJA_ENABLE_HIP) @@ -45,27 +45,31 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA reductions example...\n"; // _reductions_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 1000000; -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. 
+ // int* a = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { - if ( i % 2 == 0 ) { + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { a[i] = 1; - } else { - a[i] = -1; + } + else + { + a[i] = -1; } } -// -// Set min and max loc values -// + // + // Set min and max loc values + // constexpr int minloc_ref = N / 2; a[minloc_ref] = -100; @@ -73,26 +77,26 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) a[maxloc_ref] = 100; // _reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// - the sum will be zero -// - the min will be -100 -// - the max will be 100 -// - the min loc will be N/2 -// - the max loc will be N/2 + 1 -// -// - -// -// Define index range for iterating over a elements in all examples -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // - the sum will be zero + // - the min will be -100 + // - the max will be 100 + // - the min loc will be N/2 + // - the max loc will be N/2 + 1 + // + // + + // + // Define index range for iterating over a elements in all examples + // // _reductions_range_start -//RAJA::TypedRangeSegment arange(0, N); - // _reductions_range_end + // RAJA::TypedRangeSegment arange(0, N); + // _reductions_range_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential reductions...\n"; @@ -101,7 +105,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// /// EXERCISE: Define EXEC_POL1 and REDCUE_POL1 for executing sequentially. /// - + /// TODO... /// /// EXERCISE: Remove comments for remainder of sequential section. @@ -112,11 +116,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::ReduceSum seq_sum(0); RAJA::ReduceMin seq_min(std::numeric_limits::max()); RAJA::ReduceMax seq_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc seq_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc seq_maxloc(std::numeric_limits::min(), -1); + RAJA::ReduceMinLoc + seq_minloc(std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc + seq_maxloc(std::numeric_limits::min(), -1); RAJA::forall(arange, [=](int i) { - + seq_sum += a[i]; seq_min.min(a[i]); @@ -130,14 +136,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tsum = " << seq_sum.get() << std::endl; std::cout << "\tmin = " << seq_min.get() << std::endl; std::cout << "\tmax = " << seq_max.get() << std::endl; - std::cout << "\tmin, loc = " << seq_minloc.get() << " , " + std::cout << "\tmin, loc = " << seq_minloc.get() << " , " << seq_minloc.getLoc() << std::endl; - std::cout << "\tmax, loc = " << seq_maxloc.get() << " , " + std::cout << "\tmax, loc = " << seq_maxloc.get() << " , " << seq_maxloc.getLoc() << std::endl; */ - -//----------------------------------------------------------------------------// + + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP reductions...\n"; @@ -152,7 +158,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Define Reduce(Sum, Min, Max, MinLoc, MaxLoc) to complete this exercise. 
+ /// EXERCISE: Define Reduce(Sum, Min, Max, MinLoc, MaxLoc) to complete this + /// exercise. /// /// Uncomment 'arange' variable above so it can be used in kernel. /// @@ -181,12 +188,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tmin, loc = " << omp_minloc.get() << " , " << omp_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << omp_maxloc.get() << " , " - << omp_maxloc.getLoc() << std::endl; + << omp_maxloc.getLoc() << std::endl; */ #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA reductions...\n"; @@ -200,7 +207,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Define Reduce(Sum, Min, Max, MinLoc, MaxLoc) to complete this exercise. + /// EXERCISE: Define Reduce(Sum, Min, Max, MinLoc, MaxLoc) to complete this + /// exercise. /// /// Uncomment 'arange' variable above so it can be used in kernel. /// @@ -232,7 +240,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) */ #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP reductions...\n"; @@ -240,21 +248,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::TypedRangeSegment arange1(0, N); int* d_a = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N * sizeof(int), hipMemcpyHostToDevice)); // _reductions_raja_hippolicy_start - using EXEC_POL3 = RAJA::hip_exec; + using EXEC_POL3 = RAJA::hip_exec; using REDUCE_POL3 = RAJA::hip_reduce; // _reductions_raja_hippolicy_end RAJA::ReduceSum hip_sum(0); RAJA::ReduceMin hip_min(std::numeric_limits::max()); RAJA::ReduceMax hip_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc hip_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc hip_maxloc(std::numeric_limits::min(), -1); - - RAJA::forall(arange1, [=] RAJA_DEVICE (int i) { + RAJA::ReduceMinLoc hip_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc hip_maxloc( + std::numeric_limits::min(), -1); + RAJA::forall(arange1, [=] RAJA_DEVICE(int i) { hip_sum += d_a[i]; hip_min.min(d_a[i]); @@ -262,28 +271,27 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) hip_minloc.minloc(d_a[i], i); hip_maxloc.maxloc(d_a[i], i); - }); std::cout << "\tsum = " << hip_sum.get() << std::endl; std::cout << "\tmin = " << hip_min.get() << std::endl; std::cout << "\tmax = " << hip_max.get() << std::endl; std::cout << "\tmin, loc = " << hip_minloc.get() << " , " - << hip_minloc.getLoc() << std::endl; + << hip_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << hip_maxloc.get() << " , " - << hip_maxloc.getLoc() << std::endl; + << hip_maxloc.getLoc() << std::endl; memoryManager::deallocate_gpu(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
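// Illustrative sketch, not from the RAJA sources: the EXERCISE comments above
// ask for an execution policy, a reduction policy, and the five reducer
// objects. This condenses the sequential pattern that the solution file below
// spells out; the function name and argument list are placeholders.
#include <RAJA/RAJA.hpp>
#include <iostream>
#include <limits>

void reduction_sketch(const int* a, int N)
{
  using EXEC_POL   = RAJA::seq_exec;
  using REDUCE_POL = RAJA::seq_reduce;

  RAJA::ReduceSum<REDUCE_POL, int> vsum(0);
  RAJA::ReduceMin<REDUCE_POL, int> vmin(std::numeric_limits<int>::max());
  RAJA::ReduceMax<REDUCE_POL, int> vmax(std::numeric_limits<int>::min());
  RAJA::ReduceMinLoc<REDUCE_POL, int> vminloc(std::numeric_limits<int>::max(), -1);
  RAJA::ReduceMaxLoc<REDUCE_POL, int> vmaxloc(std::numeric_limits<int>::min(), -1);

  // Each lambda invocation folds one element into every reducer.
  RAJA::forall<EXEC_POL>(RAJA::TypedRangeSegment<int>(0, N), [=](int i) {
    vsum += a[i];
    vmin.min(a[i]);
    vmax.max(a[i]);
    vminloc.minloc(a[i], i);
    vmaxloc.maxloc(a[i], i);
  });

  // Final values are read on the host with get()/getLoc().
  std::cout << "sum = " << vsum.get()
            << ", min loc = " << vminloc.getLoc() << std::endl;
}
// The OpenMP and GPU exercises follow the same shape with
// RAJA::omp_parallel_for_exec / RAJA::omp_reduce and
// RAJA::cuda_exec<CUDA_BLOCK_SIZE> / RAJA::cuda_reduce respectively.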
+ // memoryManager::deallocate(a); std::cout << "\n DONE!...\n"; - + return 0; } diff --git a/exercises/reductions_solution.cpp b/exercises/reductions_solution.cpp index 6da731e62e..46992ec857 100644 --- a/exercises/reductions_solution.cpp +++ b/exercises/reductions_solution.cpp @@ -45,27 +45,31 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA reductions example...\n"; // _reductions_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 1000000; -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. + // int* a = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { - if ( i % 2 == 0 ) { + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { a[i] = 1; - } else { - a[i] = -1; + } + else + { + a[i] = -1; } } -// -// Set min and max loc values -// + // + // Set min and max loc values + // constexpr int minloc_ref = N / 2; a[minloc_ref] = -100; @@ -73,41 +77,42 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) a[maxloc_ref] = 100; // _reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// - the sum will be zero -// - the min will be -100 -// - the max will be 100 -// - the min loc will be N/2 -// - the max loc will be N/2 + 1 -// -// - -// -// Define index range for iterating over a elements in all examples -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // - the sum will be zero + // - the min will be -100 + // - the max will be 100 + // - the min loc will be N/2 + // - the max loc will be N/2 + 1 + // + // + + // + // Define index range for iterating over a elements in all examples + // // _reductions_range_start RAJA::TypedRangeSegment arange(0, N); // _reductions_range_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential reductions...\n"; // _reductions_raja_seq_start - using EXEC_POL1 = RAJA::seq_exec; + using EXEC_POL1 = RAJA::seq_exec; using REDUCE_POL1 = RAJA::seq_reduce; - + RAJA::ReduceSum seq_sum(0); RAJA::ReduceMin seq_min(std::numeric_limits::max()); RAJA::ReduceMax seq_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc seq_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc seq_maxloc(std::numeric_limits::min(), -1); + RAJA::ReduceMinLoc seq_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc seq_maxloc( + std::numeric_limits::min(), -1); RAJA::forall(arange, [=](int i) { - seq_sum += a[i]; seq_min.min(a[i]); @@ -115,37 +120,37 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) seq_minloc.minloc(a[i], i); seq_maxloc.maxloc(a[i], i); - }); std::cout << "\tsum = " << seq_sum.get() << std::endl; std::cout << "\tmin = " << seq_min.get() << std::endl; std::cout << "\tmax = " << seq_max.get() << std::endl; - std::cout << "\tmin, loc = " << seq_minloc.get() << " , " - << seq_minloc.getLoc() << std::endl; - std::cout << "\tmax, loc = " << seq_maxloc.get() << " , " - << seq_maxloc.getLoc() << std::endl; + std::cout << "\tmin, loc = " << seq_minloc.get() << " , " + << seq_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << seq_maxloc.get() << " , " + 
<< seq_maxloc.getLoc() << std::endl; // _reductions_raja_seq_end - -//----------------------------------------------------------------------------// + + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP reductions...\n"; // _reductions_raja_omppolicy_start - using EXEC_POL2 = RAJA::omp_parallel_for_exec; + using EXEC_POL2 = RAJA::omp_parallel_for_exec; using REDUCE_POL2 = RAJA::omp_reduce; // _reductions_raja_omppolicy_end RAJA::ReduceSum omp_sum(0); RAJA::ReduceMin omp_min(std::numeric_limits::max()); RAJA::ReduceMax omp_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc omp_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc omp_maxloc(std::numeric_limits::min(), -1); + RAJA::ReduceMinLoc omp_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc omp_maxloc( + std::numeric_limits::min(), -1); RAJA::forall(arange, [=](int i) { - omp_sum += a[i]; omp_min.min(a[i]); @@ -153,37 +158,37 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) omp_minloc.minloc(a[i], i); omp_maxloc.maxloc(a[i], i); - }); std::cout << "\tsum = " << omp_sum.get() << std::endl; std::cout << "\tmin = " << omp_min.get() << std::endl; std::cout << "\tmax = " << omp_max.get() << std::endl; std::cout << "\tmin, loc = " << omp_minloc.get() << " , " - << omp_minloc.getLoc() << std::endl; + << omp_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << omp_maxloc.get() << " , " - << omp_maxloc.getLoc() << std::endl; + << omp_maxloc.getLoc() << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA reductions...\n"; // _reductions_raja_cudapolicy_start - using EXEC_POL3 = RAJA::cuda_exec; + using EXEC_POL3 = RAJA::cuda_exec; using REDUCE_POL3 = RAJA::cuda_reduce; // _reductions_raja_cudapolicy_end RAJA::ReduceSum cuda_sum(0); RAJA::ReduceMin cuda_min(std::numeric_limits::max()); RAJA::ReduceMax cuda_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc cuda_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc cuda_maxloc(std::numeric_limits::min(), -1); - - RAJA::forall(arange, [=] RAJA_DEVICE (int i) { + RAJA::ReduceMinLoc cuda_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc cuda_maxloc( + std::numeric_limits::min(), -1); + RAJA::forall(arange, [=] RAJA_DEVICE(int i) { cuda_sum += a[i]; cuda_min.min(a[i]); @@ -191,39 +196,39 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) cuda_minloc.minloc(a[i], i); cuda_maxloc.maxloc(a[i], i); - }); std::cout << "\tsum = " << cuda_sum.get() << std::endl; std::cout << "\tmin = " << cuda_min.get() << std::endl; std::cout << "\tmax = " << cuda_max.get() << std::endl; std::cout << "\tmin, loc = " << cuda_minloc.get() << " , " - << cuda_minloc.getLoc() << std::endl; + << cuda_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << cuda_maxloc.get() << " , " - << cuda_maxloc.getLoc() << std::endl; + << cuda_maxloc.getLoc() << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP reductions...\n"; int* d_a = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + 
hipErrchk(hipMemcpy(d_a, a, N * sizeof(int), hipMemcpyHostToDevice)); // _reductions_raja_hippolicy_start - using EXEC_POL3 = RAJA::hip_exec; + using EXEC_POL3 = RAJA::hip_exec; using REDUCE_POL3 = RAJA::hip_reduce; // _reductions_raja_hippolicy_end RAJA::ReduceSum hip_sum(0); RAJA::ReduceMin hip_min(std::numeric_limits::max()); RAJA::ReduceMax hip_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc hip_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc hip_maxloc(std::numeric_limits::min(), -1); - - RAJA::forall(arange, [=] RAJA_DEVICE (int i) { + RAJA::ReduceMinLoc hip_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc hip_maxloc( + std::numeric_limits::min(), -1); + RAJA::forall(arange, [=] RAJA_DEVICE(int i) { hip_sum += d_a[i]; hip_min.min(d_a[i]); @@ -231,28 +236,27 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) hip_minloc.minloc(d_a[i], i); hip_maxloc.maxloc(d_a[i], i); - }); std::cout << "\tsum = " << hip_sum.get() << std::endl; std::cout << "\tmin = " << hip_min.get() << std::endl; std::cout << "\tmax = " << hip_max.get() << std::endl; std::cout << "\tmin, loc = " << hip_minloc.get() << " , " - << hip_minloc.getLoc() << std::endl; + << hip_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << hip_maxloc.get() << " , " - << hip_maxloc.getLoc() << std::endl; + << hip_maxloc.getLoc() << std::endl; memoryManager::deallocate_gpu(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(a); std::cout << "\n DONE!...\n"; - + return 0; } diff --git a/exercises/scan.cpp b/exercises/scan.cpp index 68f52fce2b..11e3068ff8 100644 --- a/exercises/scan.cpp +++ b/exercises/scan.cpp @@ -40,11 +40,11 @@ Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -//constexpr int CUDA_BLOCK_SIZE = 16; +// constexpr int CUDA_BLOCK_SIZE = 16; #endif #if defined(RAJA_ENABLE_HIP) -//constexpr int HIP_BLOCK_SIZE = 16; +// constexpr int HIP_BLOCK_SIZE = 16; #endif // @@ -66,14 +66,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA scan example...\n"; // _scan_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 20; -// -// Allocate and initialize vector data -// + // + // Allocate and initialize vector data + // int* in = memoryManager::allocate(N); int* out = memoryManager::allocate(N); @@ -85,11 +85,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _scan_array_init_end - -//----------------------------------------------------------------------------// -// Perform various sequential scans to illustrate inclusive/exclusive, -// in-place, default scans with different operators -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform various sequential scans to illustrate inclusive/exclusive, + // in-place, default scans with different operators + //----------------------------------------------------------------------------// std::cout << "\n Running sequential inclusive_scan (default)...\n"; @@ -97,7 +96,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement an inclusive RAJA scan with RAJA::seq_exec - /// execution policy type. 
+ /// execution policy type. /// /// NOTE: We've done this one for you to help you get started... /// @@ -111,7 +110,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential inclusive_scan (plus)...\n"; @@ -121,14 +120,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement an inclusive RAJA scan with RAJA::seq_exec - /// execution policy type and an explicit plus operator. + /// execution policy type and an explicit plus operator. /// CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential exclusive_scan (plus)...\n"; @@ -138,14 +137,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement an exclusive RAJA scan with RAJA::seq_exec - /// execution policy type and an explicit plus operator. + /// execution policy type and an explicit plus operator. /// CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential inclusive_scan_inplace (minimum)...\n"; @@ -155,14 +154,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement an inclusive inplace RAJA scan with RAJA::seq_exec - /// execution policy type and an explicit minimum operator. + /// execution policy type and an explicit minimum operator. /// CHECK_INC_SCAN_RESULTS(OP_MIN_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential exclusive_scan_inplace (maximum)...\n"; @@ -172,7 +171,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement an exclusive inplace RAJA scan with RAJA::seq_exec - /// execution policy type and an explicit maximum operator. + /// execution policy type and an explicit maximum operator. /// CHECK_EXC_SCAN_RESULTS(OP_MAX_INT) @@ -182,24 +181,25 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// Perform a couple of OpenMP scans... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of OpenMP scans... + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP inclusive_scan (plus)...\n"; /// /// TODO... /// - /// EXERCISE: Implement an inclusive RAJA scan with RAJA::omp_parallel_for_exec - /// execution policy type and an explicit plus operator. + /// EXERCISE: Implement an inclusive RAJA scan with + /// RAJA::omp_parallel_for_exec + /// execution policy type and an explicit plus operator. 
/// CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP exclusive_scan_inplace (plus)...\n"; @@ -208,8 +208,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Implement an exclusive inplace RAJA scan with RAJA::omp_parallel_for_exec - /// execution policy type and an explicit plus operator. + /// EXERCISE: Implement an exclusive inplace RAJA scan with + /// RAJA::omp_parallel_for_exec + /// execution policy type and an explicit plus operator. /// CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) @@ -218,13 +219,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// Perform a couple of CUDA scans... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of CUDA scans... + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA inclusive_scan_inplace (plus)...\n"; @@ -244,7 +245,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA exclusive_scan_inplace (plus)...\n"; @@ -264,7 +265,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA exclusive_scan (plus)...\n"; @@ -286,14 +287,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// Perform a couple of HIP scans... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of HIP scans... + //----------------------------------------------------------------------------// std::cout << "\n Running HIP inclusive_scan_inplace (plus)...\n"; @@ -301,42 +302,42 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) int* d_in = memoryManager::allocate_gpu(N); int* d_out = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); /// /// TODO... /// /// EXERCISE: Implement an inclusive inplace RAJA scan with RAJA::hip_exec - /// execution policy type and an explicit plus operator. + /// execution policy type and an explicit plus operator. 
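// Illustrative sketch, not from the RAJA sources: every scan TODO in this
// exercise uses the same call shape. This shows the sequential inclusive and
// exclusive-in-place forms; the function name and arguments are placeholders.
#include <RAJA/RAJA.hpp>

void scan_sketch(int* in, int* out, int N)
{
  // Inclusive scan: out[i] = in[0] + ... + in[i].
  RAJA::inclusive_scan<RAJA::seq_exec>(RAJA::make_span(in, N),
                                       RAJA::make_span(out, N),
                                       RAJA::operators::plus<int>{});

  // Exclusive in-place scan: out[i] becomes the sum of the first i elements.
  RAJA::exclusive_scan_inplace<RAJA::seq_exec>(RAJA::make_span(out, N),
                                               RAJA::operators::plus<int>{});
}
// The OpenMP and GPU variants swap in RAJA::omp_parallel_for_exec,
// RAJA::cuda_exec<CUDA_BLOCK_SIZE>, or RAJA::hip_exec<HIP_BLOCK_SIZE>, as the
// scan_solution.cpp hunks later in this patch show.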
/// /// NOTE: You will have to uncomment 'HIP_BLOCK_SIZE' near the top /// of the file if you want to use it here. /// - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP exclusive_scan (plus)...\n"; - hipErrchk(hipMemcpy( d_in, in, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_in, in, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); /// /// TODO... /// /// EXERCISE: Implement an exclusive RAJA scan with RAJA::hip_exec - /// execution policy type and an explicit plus operator. + /// execution policy type and an explicit plus operator. /// /// NOTE: You will have to uncomment 'HIP_BLOCK_SIZE' near the top /// of the file if you want to use it here. /// - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); @@ -347,11 +348,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(in); memoryManager::deallocate(out); @@ -368,12 +369,14 @@ template void checkInclusiveScanResult(const T* in, const T* out, int N) { T val = Function::identity(); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { val = Function()(val, in[i]); - if (out[i] != val) { + if (out[i] != val) + { std::cout << "\n\t result -- WRONG\n"; - std::cout << "\t" << out[i] << " != " << val - << " (at index " << i << ")\n"; + std::cout << "\t" << out[i] << " != " << val << " (at index " << i + << ")\n"; } } std::cout << "\n\t result -- CORRECT\n"; @@ -386,11 +389,13 @@ template void checkExclusiveScanResult(const T* in, const T* out, int N) { T val = Function::identity(); - for (int i = 0; i < N; ++i) { - if (out[i] != val) { + for (int i = 0; i < N; ++i) + { + if (out[i] != val) + { std::cout << "\n\t result -- WRONG\n"; - std::cout << "\t" << out[i] << " != " << val - << " (at index " << i << ")\n"; + std::cout << "\t" << out[i] << " != " << val << " (at index " << i + << ")\n"; } val = Function()(val, in[i]); } @@ -404,6 +409,9 @@ template void printArray(const T* v, int N) { std::cout << std::endl; - for (int i = 0; i < N; ++i) { std::cout << " " << v[i]; } + for (int i = 0; i < N; ++i) + { + std::cout << " " << v[i]; + } std::cout << std::endl; } diff --git a/exercises/scan_solution.cpp b/exercises/scan_solution.cpp index 7ed7101192..925b586101 100644 --- a/exercises/scan_solution.cpp +++ b/exercises/scan_solution.cpp @@ -40,11 +40,11 @@ Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) - constexpr int CUDA_BLOCK_SIZE = 16; +constexpr int CUDA_BLOCK_SIZE = 16; #endif #if defined(RAJA_ENABLE_HIP) - constexpr int HIP_BLOCK_SIZE = 16; +constexpr int HIP_BLOCK_SIZE = 16; #endif // @@ -66,14 +66,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** 
RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA scan example...\n"; // _scan_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 20; -// -// Allocate and initialize vector data -// + // + // Allocate and initialize vector data + // int* in = memoryManager::allocate(N); int* out = memoryManager::allocate(N); @@ -85,11 +85,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _scan_array_init_end - -//----------------------------------------------------------------------------// -// Perform various sequential scans to illustrate inclusive/exclusive, -// in-place, default scans with different operators -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform various sequential scans to illustrate inclusive/exclusive, + // in-place, default scans with different operators + //----------------------------------------------------------------------------// std::cout << "\n Running sequential inclusive_scan (default)...\n"; @@ -102,7 +101,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential inclusive_scan (plus)...\n"; @@ -118,7 +117,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential exclusive_scan (plus)...\n"; @@ -134,7 +133,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential inclusive_scan_inplace (minimum)...\n"; @@ -149,7 +148,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential exclusive_scan_inplace (maximum)...\n"; @@ -167,23 +166,24 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// Perform a couple of OpenMP scans... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of OpenMP scans... 
+ //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP inclusive_scan (plus)...\n"; // _scan_inclusive_omp_plus_start - RAJA::inclusive_scan(RAJA::make_span(in, N), - RAJA::make_span(out, N), - RAJA::operators::plus{}); + RAJA::inclusive_scan( + RAJA::make_span(in, N), + RAJA::make_span(out, N), + RAJA::operators::plus{}); // _scan_inclusive_omp_plus_end CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP exclusive_scan_inplace (plus)...\n"; @@ -191,8 +191,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _scan_exclusive_inplace_omp_plus_start RAJA::exclusive_scan_inplace( - RAJA::make_span(out, N), - RAJA::operators::plus{}); + RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_exclusive_inplace_omp_plus_end CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) @@ -201,13 +200,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// Perform a few CUDA scans... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a few CUDA scans... + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA inclusive_scan_inplace (plus)...\n"; @@ -215,15 +214,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _scan_inclusive_inplace_cuda_plus_start RAJA::inclusive_scan_inplace>( - RAJA::make_span(out, N), - RAJA::operators::plus{}); + RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_inclusive_inplace_cuda_plus_end CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA exclusive_scan_inplace (plus)...\n"; @@ -231,15 +229,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _scan_exclusive_inplace_cuda_plus_start RAJA::exclusive_scan_inplace>( - RAJA::make_span(out, N), - RAJA::operators::plus{}); + RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_exclusive_inplace_cuda_plus_end CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA exclusive_scan (plus)...\n"; @@ -258,14 +255,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// Perform a couple of HIP scans... 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of HIP scans... + //----------------------------------------------------------------------------// std::cout << "\n Running HIP inclusive_scan_inplace (plus)...\n"; @@ -273,33 +270,32 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) int* d_in = memoryManager::allocate_gpu(N); int* d_out = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); // _scan_inclusive_inplace_hip_plus_start RAJA::inclusive_scan_inplace>( - RAJA::make_span(d_out, N), - RAJA::operators::plus{}); + RAJA::make_span(d_out, N), RAJA::operators::plus{}); // _scan_inclusive_inplace_hip_plus_end - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP exclusive_scan (plus)...\n"; - hipErrchk(hipMemcpy( d_in, in, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_in, in, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); RAJA::exclusive_scan>( RAJA::make_span(d_in, N), RAJA::make_span(d_out, N), RAJA::operators::plus{}); - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); @@ -310,11 +306,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
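// Illustrative sketch, not from the RAJA sources: the HIP scans above follow a
// stage / compute / copy-back flow. This condenses that flow, assuming a
// HIP-enabled build, the example's memoryManager helpers, and the
// HIP_BLOCK_SIZE constant defined near the top of the file; the function name
// is a placeholder.
#include <RAJA/RAJA.hpp>
#include "memoryManager.hpp"

void hip_scan_sketch(int* out, int N)
{
  int* d_out = memoryManager::allocate_gpu<int>(N);

  // Stage host data on the device.
  hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice));

  // Run the scan on the device; the block size is a tuning parameter.
  RAJA::inclusive_scan_inplace<RAJA::hip_exec<HIP_BLOCK_SIZE>>(
      RAJA::make_span(d_out, N), RAJA::operators::plus<int>{});

  // Copy the result back and release device memory.
  hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost));
  memoryManager::deallocate_gpu(d_out);
}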
+ // memoryManager::deallocate(in); memoryManager::deallocate(out); @@ -331,12 +327,14 @@ template void checkInclusiveScanResult(const T* in, const T* out, int N) { T val = Function::identity(); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { val = Function()(val, in[i]); - if (out[i] != val) { + if (out[i] != val) + { std::cout << "\n\t result -- WRONG\n"; - std::cout << "\t" << out[i] << " != " << val - << " (at index " << i << ")\n"; + std::cout << "\t" << out[i] << " != " << val << " (at index " << i + << ")\n"; } } std::cout << "\n\t result -- CORRECT\n"; @@ -349,11 +347,13 @@ template void checkExclusiveScanResult(const T* in, const T* out, int N) { T val = Function::identity(); - for (int i = 0; i < N; ++i) { - if (out[i] != val) { + for (int i = 0; i < N; ++i) + { + if (out[i] != val) + { std::cout << "\n\t result -- WRONG\n"; - std::cout << "\t" << out[i] << " != " << val - << " (at index " << i << ")\n"; + std::cout << "\t" << out[i] << " != " << val << " (at index " << i + << ")\n"; } val = Function()(val, in[i]); } @@ -367,6 +367,9 @@ template void printArray(const T* v, int N) { std::cout << std::endl; - for (int i = 0; i < N; ++i) { std::cout << " " << v[i]; } + for (int i = 0; i < N; ++i) + { + std::cout << " " << v[i]; + } std::cout << std::endl; } diff --git a/exercises/segment-indexset-basics.cpp b/exercises/segment-indexset-basics.cpp index b7c0c26458..490be37848 100644 --- a/exercises/segment-indexset-basics.cpp +++ b/exercises/segment-indexset-basics.cpp @@ -20,9 +20,9 @@ * * In this exercise, you will learn how to create RAJA segments and index sets * and use them to execute kernels. There are no computations performed in the - * exercises and no parallel execution. The kernels contain only print + * exercises and no parallel execution. The kernels contain only print * statements to illustrate various iteration patterns. Thus, all kernels - * look the same. The only thing that changes in these versions is the object + * look the same. The only thing that changes in these versions is the object * passed to the 'forall' method that defines the iteration space. * * RAJA features shown: @@ -43,59 +43,58 @@ using IdxType = int; using RangeSegType = RAJA::TypedRangeSegment; using RangeStrideSegType = RAJA::TypedRangeStrideSegment; using ListSegType = RAJA::TypedListSegment; -using IndexSetType = RAJA::TypedIndexSet< RangeSegType, ListSegType >; +using IndexSetType = RAJA::TypedIndexSet; // _raja_segment_type_end -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA segments index sets and index sets...\n"; -// Resource object used to construct list segment objects with indices -// living in host (CPU) memory. + // Resource object used to construct list segment objects with indices + // living in host (CPU) memory. 
camp::resources::Resource host_res{camp::resources::Host()}; -//----------------------------------------------------------------------------// -// Stride-1 iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Stride-1 iteration spaces + //----------------------------------------------------------------------------// std::cout << "\n Running C-version range kernel...\n"; // _cstyle_range1_start - for (IdxType i = 0; i < 20; i++) { - std::cout << i << " "; + for (IdxType i = 0; i < 20; i++) + { + std::cout << i << " "; } // _cstyle_range1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA range kernel...\n"; // _raja_range1_start - RAJA::forall(RangeSegType(0, 20), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeSegType(0, 20), + [=](IdxType i) { std::cout << i << " "; }); // _raja_range1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-1 range kernel...\n"; // _raja_striderange1_start - RAJA::forall(RangeStrideSegType(0, 20, 1), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeStrideSegType(0, 20, 1), + [=](IdxType i) { std::cout << i << " "; }); // _raja_striderange1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-1 list kernel...\n"; @@ -104,47 +103,49 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Collect indices in a vector to create list segment // std::vector idx; - for (IdxType i = 0; i < 20; ++i) { - idx.push_back(i); - } + for (IdxType i = 0; i < 20; ++i) + { + idx.push_back(i); + } - ListSegType idx_list1( idx, host_res ); + ListSegType idx_list1(idx, host_res); - RAJA::forall(idx_list1, [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(idx_list1, + [=](IdxType i) { std::cout << i << " "; }); // _raja_list1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running C-style stride-1 list kernel...\n"; // _cstyle_list1_start - IdxType iis = static_cast(idx.size()); // to avoid compiler warning - for (IdxType ii = 0; ii < iis; ++ii) { - std::cout << idx[ ii ] << " "; + IdxType iis = static_cast(idx.size()); // to avoid compiler warning + for (IdxType ii = 0; ii < iis; ++ii) + { + std::cout << idx[ii] << " "; } // _cstyle_list1_end std::cout << std::endl; -//----------------------------------------------------------------------------// -// Negative stride iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Negative stride iteration spaces + //----------------------------------------------------------------------------// std::cout << "\n Running C-version negative stride kernel...\n"; // _cstyle_negstriderange1_start - for (IdxType i = 19; i > -1; i--) { + for (IdxType i = 19; i > -1; i--) + { std::cout << i << " "; } // _cstyle_negstriderange1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA negative stride kernel...\n"; @@ -156,9 +157,9 @@ int main(int RAJA_UNUSED_ARG(argc), char 
**RAJA_UNUSED_ARG(argv[])) std::cout << std::endl; -//----------------------------------// -// List variant -//----------------------------------// + //----------------------------------// + // List variant + //----------------------------------// std::cout << "\n Running RAJA negative stride list kernel...\n"; @@ -166,43 +167,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Reverse the order of indices in the vector // - std::reverse( idx.begin(), idx.end() ); - ListSegType idx_list1_reverse( &idx[0], idx.size(), host_res ); + std::reverse(idx.begin(), idx.end()); + ListSegType idx_list1_reverse(&idx[0], idx.size(), host_res); - RAJA::forall(idx_list1_reverse, [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(idx_list1_reverse, + [=](IdxType i) { std::cout << i << " "; }); // _raja_negstridelist1_end std::cout << std::endl; -//----------------------------------------------------------------------------// -// Non-unit uniform stride iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Non-unit uniform stride iteration spaces + //----------------------------------------------------------------------------// std::cout << "\n Running C-version stride-2 range kernel...\n"; // _cstyle_range2_start - for (IdxType i = 0; i < 20; i += 2) { + for (IdxType i = 0; i < 20; i += 2) + { std::cout << i << " "; } // _cstyle_range2_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-2 range kernel...\n"; // _raja_range2_start - RAJA::forall(RangeStrideSegType(0, 20, 2), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeStrideSegType(0, 20, 2), + [=](IdxType i) { std::cout << i << " "; }); // _raja_range2_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-3 range kernel...\n"; @@ -214,50 +214,50 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << std::endl; -//----------------------------------------------------------------------------// -// IndexSets: complex iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // IndexSets: complex iteration spaces + //----------------------------------------------------------------------------// -// -// Sequential index set execution policy used in several of the following -// example implementations. -// + // + // Sequential index set execution policy used in several of the following + // example implementations. 
+ // // _raja_seq_indexset_policy_start - using SEQ_ISET_EXECPOL = RAJA::ExecPolicy; + using SEQ_ISET_EXECPOL = RAJA::ExecPolicy; // _raja_seq_indexset_policy__end std::cout << "\n Running RAJA index set (2 RangeSegments) kernel...\n"; // _raja_indexset_2ranges_start IndexSetType is2; - is2.push_back( RangeSegType(0, 10) ); - is2.push_back( RangeSegType(15, 20) ); - - RAJA::forall(is2, [=] (IdxType i) { - std::cout << i << " "; - }); + is2.push_back(RangeSegType(0, 10)); + is2.push_back(RangeSegType(15, 20)); + + RAJA::forall(is2, + [=](IdxType i) { std::cout << i << " "; }); // _raja_indexset_2ranges_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running C-version of two segment kernel...\n"; // _cstyle_2ranges_start - for (IdxType i = 0; i < 10; ++i) { + for (IdxType i = 0; i < 10; ++i) + { std::cout << i << " "; } - for (IdxType i = 15; i < 20; ++i) { + for (IdxType i = 15; i < 20; ++i) + { std::cout << i << " "; } // _cstyle_2ranges_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA index set (3 segments) kernel...\n"; @@ -265,20 +265,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Make a RAJA version of a kernel that prints the sequence - /// + /// /// 0 1 2 3 4 5 6 7 10 11 14 20 22 24 25 26 27 /// - /// using a RAJA::TypedIndexSet containing two - /// RAJA::TypedRangeSegment objects and on - /// RAJA::TypedListSegment object. + /// using a RAJA::TypedIndexSet containing two + /// RAJA::TypedRangeSegment objects and on + /// RAJA::TypedListSegment object. /// std::cout << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n DONE!...\n"; - + return 0; } - diff --git a/exercises/segment-indexset-basics_solution.cpp b/exercises/segment-indexset-basics_solution.cpp index 4267582d98..d3bf08ec52 100644 --- a/exercises/segment-indexset-basics_solution.cpp +++ b/exercises/segment-indexset-basics_solution.cpp @@ -20,9 +20,9 @@ * * In this exercise, you will learn how to create RAJA segments and index sets * and use them to execute kernels. There are no computations performed in the - * exercises and no parallel execution. The kernels contain only print + * exercises and no parallel execution. The kernels contain only print * statements to illustrate various iteration patterns. Thus, all kernels - * look the same. The only thing that changes in these versions is the object + * look the same. The only thing that changes in these versions is the object * passed to the 'forall' method that defines the iteration space. * * RAJA features shown: @@ -43,59 +43,58 @@ using IdxType = int; using RangeSegType = RAJA::TypedRangeSegment; using RangeStrideSegType = RAJA::TypedRangeStrideSegment; using ListSegType = RAJA::TypedListSegment; -using IndexSetType = RAJA::TypedIndexSet< RangeSegType, ListSegType >; +using IndexSetType = RAJA::TypedIndexSet; // _raja_segment_type_end -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA segments index sets and index sets...\n"; -// Resource object used to construct list segment objects with indices -// living in host (CPU) memory. 
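// Illustrative sketch, not from the RAJA sources: the 3-segment EXERCISE above
// composes range and list segments into a single iteration space. This writes
// out the template arguments on the types the exercise aliases; the function
// name and the list indices are placeholders.
#include <RAJA/RAJA.hpp>
#include <iostream>
#include <vector>

void indexset_sketch()
{
  using IdxType      = int;
  using RangeSegType = RAJA::TypedRangeSegment<IdxType>;
  using ListSegType  = RAJA::TypedListSegment<IdxType>;
  using IndexSetType = RAJA::TypedIndexSet<RangeSegType, ListSegType>;

  // List segments copy their indices into memory owned by a camp resource.
  camp::resources::Resource host_res{camp::resources::Host()};
  std::vector<IdxType> idx = {10, 11, 14};
  ListSegType list_seg(&idx[0], idx.size(), host_res);

  IndexSetType iset;
  iset.push_back(RangeSegType(0, 8));
  iset.push_back(list_seg);

  // The outer policy walks segments; the inner policy runs each segment.
  using ISET_EXECPOL = RAJA::ExecPolicy<RAJA::seq_segit, RAJA::seq_exec>;

  RAJA::forall<ISET_EXECPOL>(iset, [=](IdxType i) { std::cout << i << " "; });
}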
+ // Resource object used to construct list segment objects with indices + // living in host (CPU) memory. camp::resources::Resource host_res{camp::resources::Host()}; -//----------------------------------------------------------------------------// -// Stride-1 iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Stride-1 iteration spaces + //----------------------------------------------------------------------------// std::cout << "\n Running C-version range kernel...\n"; -// _cstyle_range1_start - for (IdxType i = 0; i < 20; i++) { - std::cout << i << " "; + // _cstyle_range1_start + for (IdxType i = 0; i < 20; i++) + { + std::cout << i << " "; } -// _cstyle_range1_end + // _cstyle_range1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA range kernel...\n"; // _raja_range1_start - RAJA::forall(RangeSegType(0, 20), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeSegType(0, 20), + [=](IdxType i) { std::cout << i << " "; }); // _raja_range1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-1 range kernel...\n"; // _raja_striderange1_start - RAJA::forall(RangeStrideSegType(0, 20, 1), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeStrideSegType(0, 20, 1), + [=](IdxType i) { std::cout << i << " "; }); // _raja_striderange1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-1 list kernel...\n"; @@ -104,61 +103,62 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Collect indices in a vector to create list segment // std::vector idx; - for (IdxType i = 0; i < 20; ++i) { - idx.push_back(i); - } + for (IdxType i = 0; i < 20; ++i) + { + idx.push_back(i); + } - ListSegType idx_list1( idx, host_res ); + ListSegType idx_list1(idx, host_res); - RAJA::forall(idx_list1, [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(idx_list1, + [=](IdxType i) { std::cout << i << " "; }); // _raja_list1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running C-style stride-1 list kernel...\n"; // _cstyle_list1_start - IdxType iis = static_cast(idx.size()); // to avoid compiler warning - for (IdxType ii = 0; ii < iis; ++ii) { - std::cout << idx[ ii ] << " "; + IdxType iis = static_cast(idx.size()); // to avoid compiler warning + for (IdxType ii = 0; ii < iis; ++ii) + { + std::cout << idx[ii] << " "; } // _cstyle_list1_end std::cout << std::endl; -//----------------------------------------------------------------------------// -// Negative stride iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Negative stride iteration spaces + //----------------------------------------------------------------------------// std::cout << "\n Running C-version negative stride kernel...\n"; // _cstyle_negstriderange1_start - for (IdxType i = 19; i > -1; i--) { + for (IdxType i = 19; i > -1; i--) + { std::cout << i << " "; } // _cstyle_negstriderange1_end std::cout << std::endl; -//----------------------------------// + 
//----------------------------------// std::cout << "\n Running RAJA negative stride kernel...\n"; // _raja_negstriderange1_start - RAJA::forall(RangeStrideSegType(19, -1, -1), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeStrideSegType(19, -1, -1), + [=](IdxType i) { std::cout << i << " "; }); // _raja_negstriderange1_end std::cout << std::endl; -//----------------------------------// -// List variant -//----------------------------------// + //----------------------------------// + // List variant + //----------------------------------// std::cout << "\n Running RAJA negative stride list kernel...\n"; @@ -166,121 +166,117 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Reverse the order of indices in the vector // - std::reverse( idx.begin(), idx.end() ); - ListSegType idx_list1_reverse( &idx[0], idx.size(), host_res ); + std::reverse(idx.begin(), idx.end()); + ListSegType idx_list1_reverse(&idx[0], idx.size(), host_res); - RAJA::forall(idx_list1_reverse, [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(idx_list1_reverse, + [=](IdxType i) { std::cout << i << " "; }); // _raja_negstridelist1_end std::cout << std::endl; -//----------------------------------------------------------------------------// -// Non-unit uniform stride iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Non-unit uniform stride iteration spaces + //----------------------------------------------------------------------------// std::cout << "\n Running C-version stride-2 range kernel...\n"; // _cstyle_range2_start - for (IdxType i = 0; i < 20; i += 2) { + for (IdxType i = 0; i < 20; i += 2) + { std::cout << i << " "; } // _cstyle_range2_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-2 range kernel...\n"; // _raja_range2_start - RAJA::forall(RangeStrideSegType(0, 20, 2), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeStrideSegType(0, 20, 2), + [=](IdxType i) { std::cout << i << " "; }); // _raja_range2_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-3 range kernel...\n"; // _raja_range3_start - RAJA::forall(RangeStrideSegType(0, 20, 3), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeStrideSegType(0, 20, 3), + [=](IdxType i) { std::cout << i << " "; }); // _raja_range3_end std::cout << std::endl; -//----------------------------------------------------------------------------// -// IndexSets: complex iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // IndexSets: complex iteration spaces + //----------------------------------------------------------------------------// -// -// Sequential index set execution policy used in several of the following -// example implementations. -// + // + // Sequential index set execution policy used in several of the following + // example implementations. 
+ // std::cout << "\n Running RAJA index set (2 RangeSegments) kernel...\n"; // _raja_indexset_2ranges_start - using SEQ_ISET_EXECPOL = RAJA::ExecPolicy; + using SEQ_ISET_EXECPOL = RAJA::ExecPolicy; IndexSetType is2; - is2.push_back( RangeSegType(0, 10) ); - is2.push_back( RangeSegType(15, 20) ); - - RAJA::forall(is2, [=] (IdxType i) { - std::cout << i << " "; - }); + is2.push_back(RangeSegType(0, 10)); + is2.push_back(RangeSegType(15, 20)); + + RAJA::forall(is2, + [=](IdxType i) { std::cout << i << " "; }); // _raja_indexset_2ranges_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running C-version of two segment kernel...\n"; // _cstyle_2ranges_start - for (IdxType i = 0; i < 10; ++i) { + for (IdxType i = 0; i < 10; ++i) + { std::cout << i << " "; } - for (IdxType i = 15; i < 20; ++i) { + for (IdxType i = 15; i < 20; ++i) + { std::cout << i << " "; } // _cstyle_2ranges_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA index set (3 segments) kernel...\n"; // _raja_indexset_3segs_start IndexSetType is3; - is3.push_back( RangeSegType(0, 8) ); + is3.push_back(RangeSegType(0, 8)); - IdxType indx[ ] = {10, 11, 14, 20, 22}; - ListSegType list2( indx, 5, host_res ); - is3.push_back( list2 ); + IdxType indx[] = {10, 11, 14, 20, 22}; + ListSegType list2(indx, 5, host_res); + is3.push_back(list2); - is3.push_back( RangeSegType(24, 28) ); - - RAJA::forall(is3, [=] (IdxType i) { - std::cout << i << " "; - }); + is3.push_back(RangeSegType(24, 28)); + + RAJA::forall(is3, + [=](IdxType i) { std::cout << i << " "; }); // _raja_indexset_3segs_end std::cout << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n DONE!...\n"; - + return 0; } - diff --git a/exercises/sort.cpp b/exercises/sort.cpp index 21a5fb5edd..1b13eb20ac 100644 --- a/exercises/sort.cpp +++ b/exercises/sort.cpp @@ -8,10 +8,12 @@ #define OP_GREATER RAJA::operators::greater #define OP_LESS RAJA::operators::less -#define CHECK_UNSTABLE_SORT_RESULT(X) checkUnstableSortResult(in, out, N) -#define CHECK_UNSTABLE_SORT_PAIR_RESULT(X) checkUnstableSortResult(in, out, in_vals, out_vals, N) -#define CHECK_STABLE_SORT_RESULT(X) checkStableSortResult(in, out, N) -#define CHECK_STABLE_SORT_PAIR_RESULT(X) checkStableSortResult(in, out, in_vals, out_vals, N) +#define CHECK_UNSTABLE_SORT_RESULT(X) checkUnstableSortResult(in, out, N) +#define CHECK_UNSTABLE_SORT_PAIR_RESULT(X) \ + checkUnstableSortResult(in, out, in_vals, out_vals, N) +#define CHECK_STABLE_SORT_RESULT(X) checkStableSortResult(in, out, N) +#define CHECK_STABLE_SORT_PAIR_RESULT(X) \ + checkStableSortResult(in, out, in_vals, out_vals, N) #include #include @@ -30,9 +32,9 @@ /* * Sort Exercise * - * Exercise demonstrates how to perform RAJA unstable and stable sort operations - * for integer arrays, including pairs variant, using different comparators. - * Other array data types, comparators, etc. are similar + * Exercise demonstrates how to perform RAJA unstable and stable sort + * operations for integer arrays, including pairs variant, using different + * comparators. Other array data types, comparators, etc. 
are similar * * RAJA features shown: * - `RAJA::sort` and `RAJA::sort_pairs` methods @@ -47,11 +49,11 @@ Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -//constexpr int CUDA_BLOCK_SIZE = 16; +// constexpr int CUDA_BLOCK_SIZE = 16; #endif #if defined(RAJA_ENABLE_HIP) -//constexpr int HIP_BLOCK_SIZE = 16; +// constexpr int HIP_BLOCK_SIZE = 16; #endif // @@ -60,14 +62,20 @@ template void checkUnstableSortResult(const T* in, const T* out, int N); template -void checkUnstableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N); +void checkUnstableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N); // template void checkStableSortResult(const T* in, const T* out, int N); template -void checkStableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N); +void checkStableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N); // template void printArray(const T* k, int N); @@ -81,27 +89,27 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA sort example...\n"; // _sort_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 20; -// -// Allocate and initialize vector data -// + // + // Allocate and initialize vector data + // int* in = memoryManager::allocate(N); int* out = memoryManager::allocate(N); unsigned* in_vals = memoryManager::allocate(N); unsigned* out_vals = memoryManager::allocate(N); - std::iota(in , in + N/2, 0); - std::iota(in + N/2, in + N , 0); - std::shuffle(in , in + N/2, std::mt19937{12345u}); - std::shuffle(in + N/2, in + N , std::mt19937{67890u}); + std::iota(in, in + N / 2, 0); + std::iota(in + N / 2, in + N, 0); + std::shuffle(in, in + N / 2, std::mt19937{12345u}); + std::shuffle(in + N / 2, in + N, std::mt19937{67890u}); - std::fill(in_vals , in_vals + N/2, 0); - std::fill(in_vals + N/2, in_vals + N , 1); + std::fill(in_vals, in_vals + N / 2, 0); + std::fill(in_vals + N / 2, in_vals + N, 1); std::cout << "\n in keys...\n"; printArray(in, N); @@ -112,10 +120,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _sort_array_init_end -//----------------------------------------------------------------------------// -// Perform various sequential sorts to illustrate unstable/stable, -// pairs, default sorts with different comparators -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform various sequential sorts to illustrate unstable/stable, + // pairs, default sorts with different comparators + //----------------------------------------------------------------------------// std::cout << "\n Running sequential sort (default)...\n"; @@ -123,7 +131,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a RAJA sort with RAJA::seq_exec - /// execution policy type. + /// execution policy type. /// /// NOTE: We've done this one for you to help you get started... 
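The sort calls in this exercise likewise appear without their execution-policy template parameters. A sketch of the starter sequential sort spelled out in full, assuming RAJA::seq_exec as the comment above indicates:

    // Default (non-decreasing) sequential sort of the key array.
    RAJA::sort<RAJA::seq_exec>(RAJA::make_span(out, N));

    // The same sort with the comparator made explicit.
    RAJA::sort<RAJA::seq_exec>(RAJA::make_span(out, N),
                               RAJA::operators::less<int>{});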
/// @@ -134,12 +142,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::sort(RAJA::make_span(out, N)); // _sort_seq_end - //checkUnstableSortResult>(in, out, N); + // checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential sort (non-decreasing)...\n"; @@ -149,15 +157,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a RAJA sort with RAJA::seq_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - //checkUnstableSortResult>(in, out, N); + // checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential stable_sort (non-decreasing)...\n"; @@ -167,15 +175,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a stable RAJA sort with RAJA::seq_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential stable_sort (non-increasing)...\n"; @@ -185,15 +193,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a stable RAJA sort with RAJA::seq_exec execution - /// policy type and an explicit greater operation. + /// policy type and an explicit greater operation. /// - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_GREATER); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential sort_pairs (non-decreasing)...\n"; @@ -204,15 +212,16 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a RAJA pair sort with RAJA::seq_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + // checkUnstableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); printArray(out, out_vals, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential stable_sort_pairs (non-increasing)...\n"; @@ -223,10 +232,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a stable RAJA pair sort with RAJA::seq_exec execution - /// policy type and an explicit greater operation. 
+ /// policy type and an explicit greater operation. /// - //checkStableSortResult>(in, out, in_vals, out_vals, N); + // checkStableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; @@ -234,9 +244,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// Perform a couple of OpenMP sorts... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of OpenMP sorts... + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP sort (non-decreasing)...\n"; @@ -246,15 +256,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a RAJA sort with RAJA::omp_parallel_for_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - //checkUnstableSortResult>(in, out, N); + // checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP stable_sort_pairs (non-increasing)...\n"; @@ -264,24 +274,26 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Implement a stable RAJA sort with RAJA::omp_parallel_for_exec execution - /// policy type and an explicit greater operation. + /// EXERCISE: Implement a stable RAJA sort with RAJA::omp_parallel_for_exec + /// execution + /// policy type and an explicit greater operation. /// - //checkStableSortResult>(in, out, in_vals, out_vals, N); + // checkStableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// Perform a couple of CUDA sorts... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of CUDA sorts... + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA sort_pairs (non-increasing)...\n"; @@ -292,18 +304,19 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a RAJA pair sort with RAJA::cuda_exec execution - /// policy type and an explicit greater operation. + /// policy type and an explicit greater operation. /// - /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the /// top of the file if you want to use it here. 
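A sketch of what the completed CUDA pair sort for this step presumably looks like, with the policy and comparator written out; it assumes the CUDA_BLOCK_SIZE constant noted above has been uncommented:

    // Non-increasing pair sort on the GPU: keys in 'out', values in 'out_vals'.
    RAJA::sort_pairs<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
        RAJA::make_span(out, N),
        RAJA::make_span(out_vals, N),
        RAJA::operators::greater<int>{});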
/// - //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + // checkUnstableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA stable_sort (non-decreasing)...\n"; @@ -313,26 +326,26 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a stable RAJA pair sort with RAJA::cuda_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the /// top of the file if you want to use it here. /// - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// Perform a couple of HIP sorts... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of HIP sorts... + //----------------------------------------------------------------------------// std::cout << "\n Running HIP sort_pairs (non-decreasing)...\n"; @@ -342,48 +355,51 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) int* d_out = memoryManager::allocate_gpu(N); int* d_out_vals = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_out_vals, out_vals, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_out_vals, out_vals, N * sizeof(int), hipMemcpyHostToDevice)); /// /// TODO... /// /// EXERCISE: Implement a RAJA pair sort with RAJA::hip_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the /// top of the file if you want to use it here. 
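A corresponding sketch for the HIP pair sort, operating on the device copies made just above; it assumes the HIP_BLOCK_SIZE constant commented out near the top of the file is the one intended (the note above mentions CUDA_BLOCK_SIZE):

    // Non-decreasing pair sort on the device buffers.
    RAJA::sort_pairs<RAJA::hip_exec<HIP_BLOCK_SIZE>>(
        RAJA::make_span(d_out, N),
        RAJA::make_span(d_out_vals, N),
        RAJA::operators::less<int>{});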
/// - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); - hipErrchk(hipMemcpy( out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); + hipErrchk( + hipMemcpy(out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost)); - //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + // checkUnstableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); printArray(out, out_vals, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP stable_sort (non-increasing)...\n"; std::copy_n(in, N, out); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); /// /// TODO... /// /// EXERCISE: Implement a stable RAJA sort with RAJA::hip_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the /// top of the file if you want to use it here. /// - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_GREATER); printArray(out, N); std::cout << "\n"; @@ -394,11 +410,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(in); memoryManager::deallocate(out); @@ -428,9 +444,11 @@ void checkUnstableSortResult(const T* in, const T* out, int N) // make map of keys to keys using val_map = std::unordered_multiset; std::unordered_map keys; - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { auto key_iter = keys.find(in[i]); - if (key_iter == keys.end()) { + if (key_iter == keys.end()) + { auto ret = keys.emplace(in[i], val_map{}); assert(ret.second); key_iter = ret.first; @@ -438,54 +456,60 @@ void checkUnstableSortResult(const T* in, const T* out, int N) key_iter->second.emplace(in[i]); } - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { // test ordering - if (i > 0 && comp(out[i], out[i-1])) { - if (correct) { + if (i > 0 && comp(out[i], out[i - 1])) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i-1] << ", " << out[i] - << " out of order" - << " (at index " << i-1 << ")\n"; + std::cout << "\t" << out[i - 1] << ", " << out[i] << " out of order" + << " (at index " << i - 1 << ")\n"; } // test there is an item with this auto key_iter = keys.find(out[i]); - if (key_iter == keys.end()) { - if (correct) { + if (key_iter == keys.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i] - << " unknown or duplicate key" + std::cout << "\t" << out[i] << " unknown or duplicate key" << " (at index " << i << ")\n"; } auto val_iter = key_iter->second.find(out[i]); - if (val_iter == key_iter->second.end()) { - if (correct) { + if (val_iter == key_iter->second.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i] - << " unknown or duplicate val" + std::cout << "\t" << out[i] << " unknown or duplicate val" << " (at index " << i << ")\n"; } key_iter->second.erase(val_iter); - if (key_iter->second.size() == 0) { + if (key_iter->second.size() == 0) + { keys.erase(key_iter); } } - if (correct) { + if (correct) + { std::cout << "\n\t result -- CORRECT\n"; } } template -void checkUnstableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N) +void checkUnstableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N) { Comparator comp; bool correct = true; @@ -493,9 +517,11 @@ void checkUnstableSortResult(const T* in, const T* out, // make map of keys to vals using val_map = std::unordered_multiset; std::unordered_map keys_to_vals; - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { auto key_iter = keys_to_vals.find(in[i]); - if (key_iter == keys_to_vals.end()) { + if (key_iter == keys_to_vals.end()) + { auto ret = keys_to_vals.emplace(in[i], val_map{}); assert(ret.second); key_iter = ret.first; @@ -503,48 +529,57 @@ void checkUnstableSortResult(const T* in, const T* out, key_iter->second.emplace(in_vals[i]); } - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { // test ordering - if (i > 0 && comp(out[i], out[i-1])) { - if (correct) { + if (i > 0 && comp(out[i], out[i - 1])) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i-1] << "," << out_vals[i-1] << ")," - << " (" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i - 1] << "," << out_vals[i - 1] << ")," + << " (" << out[i] << "," << out_vals[i] << ")" << " out of order" - 
<< " (at index " << i-1 << ")\n"; + << " (at index " << i - 1 << ")\n"; } // test there is a pair with this key and val auto key_iter = keys_to_vals.find(out[i]); - if (key_iter == keys_to_vals.end()) { - if (correct) { + if (key_iter == keys_to_vals.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i] << "," << out_vals[i] << ")" << " unknown or duplicate key" << " (at index " << i << ")\n"; } auto val_iter = key_iter->second.find(out_vals[i]); - if (val_iter == key_iter->second.end()) { - if (correct) { + if (val_iter == key_iter->second.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i] << "," << out_vals[i] << ")" << " unknown or duplicate val" << " (at index " << i << ")\n"; } key_iter->second.erase(val_iter); - if (key_iter->second.size() == 0) { + if (key_iter->second.size() == 0) + { keys_to_vals.erase(key_iter); } } - if (correct) { + if (correct) + { std::cout << "\n\t result -- CORRECT\n"; } } @@ -561,9 +596,11 @@ void checkStableSortResult(const T* in, const T* out, int N) // make map of keys to keys using val_map = std::list; std::unordered_map keys; - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { auto key_iter = keys.find(in[i]); - if (key_iter == keys.end()) { + if (key_iter == keys.end()) + { auto ret = keys.emplace(in[i], val_map{}); assert(ret.second); key_iter = ret.first; @@ -571,53 +608,59 @@ void checkStableSortResult(const T* in, const T* out, int N) key_iter->second.emplace_back(in[i]); } - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { // test ordering - if (i > 0 && comp(out[i], out[i-1])) { - if (correct) { + if (i > 0 && comp(out[i], out[i - 1])) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i-1] << ", " << out[i] - << " out of order " - << " (at index " << i-1 << ")\n"; + std::cout << "\t" << out[i - 1] << ", " << out[i] << " out of order " + << " (at index " << i - 1 << ")\n"; } // test there is an item with this auto key_iter = keys.find(out[i]); - if (key_iter == keys.end()) { - if (correct) { + if (key_iter == keys.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i] - << " unknown or duplicate key " + std::cout << "\t" << out[i] << " unknown or duplicate key " << " (at index " << i << ")\n"; } - if (key_iter->second.front() != out[i]) { - if (correct) { + if (key_iter->second.front() != out[i]) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i] - << " out of stable order or unknown val " + std::cout << "\t" << out[i] << " out of stable order or unknown val " << " (at index " << i << ")\n"; } key_iter->second.pop_front(); - if (key_iter->second.size() == 0) { + if (key_iter->second.size() == 0) + { keys.erase(key_iter); } } - if (correct) { + if (correct) + { std::cout << "\n\t result -- CORRECT\n"; } } template -void checkStableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N) +void checkStableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N) { Comparator comp; bool correct = true; @@ -625,9 +668,11 @@ void checkStableSortResult(const T* in, const T* out, // make map of 
keys to vals using val_map = std::list; std::unordered_map keys_to_vals; - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { auto key_iter = keys_to_vals.find(in[i]); - if (key_iter == keys_to_vals.end()) { + if (key_iter == keys_to_vals.end()) + { auto ret = keys_to_vals.emplace(in[i], val_map{}); assert(ret.second); key_iter = ret.first; @@ -635,47 +680,56 @@ void checkStableSortResult(const T* in, const T* out, key_iter->second.emplace_back(in_vals[i]); } - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { // test ordering - if (i > 0 && comp(out[i], out[i-1])) { - if (correct) { + if (i > 0 && comp(out[i], out[i - 1])) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i-1] << "," << out_vals[i-1] << ")," - << " (" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i - 1] << "," << out_vals[i - 1] << ")," + << " (" << out[i] << "," << out_vals[i] << ")" << " out of order " - << " (at index " << i-1 << ")\n"; + << " (at index " << i - 1 << ")\n"; } // test there is a pair with this key and val auto key_iter = keys_to_vals.find(out[i]); - if (key_iter == keys_to_vals.end()) { - if (correct) { + if (key_iter == keys_to_vals.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i] << "," << out_vals[i] << ")" << " unknown or duplicate key " << " (at index " << i << ")\n"; } - if (key_iter->second.front() != out_vals[i]) { - if (correct) { + if (key_iter->second.front() != out_vals[i]) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i] << "," << out_vals[i] << ")" << " out of stable order or unknown val " << " (at index " << i << ")\n"; } key_iter->second.pop_front(); - if (key_iter->second.size() == 0) { + if (key_iter->second.size() == 0) + { keys_to_vals.erase(key_iter); } } - if (correct) { + if (correct) + { std::cout << "\n\t result -- CORRECT\n"; } } @@ -688,7 +742,10 @@ template void printArray(const T* k, int N) { std::cout << std::endl; - for (int i = 0; i < N; ++i) { std::cout << " " << k[i]; } + for (int i = 0; i < N; ++i) + { + std::cout << " " << k[i]; + } std::cout << std::endl; } /// @@ -696,7 +753,9 @@ template void printArray(const T* k, const U* v, int N) { std::cout << std::endl; - for (int i = 0; i < N; ++i) { std::cout << " (" << k[i] << "," << v[i] << ")"; } + for (int i = 0; i < N; ++i) + { + std::cout << " (" << k[i] << "," << v[i] << ")"; + } std::cout << std::endl; } - diff --git a/exercises/sort_solution.cpp b/exercises/sort_solution.cpp index 98f65c6dbe..5414885e67 100644 --- a/exercises/sort_solution.cpp +++ b/exercises/sort_solution.cpp @@ -8,10 +8,12 @@ #define OP_GREATER RAJA::operators::greater #define OP_LESS RAJA::operators::less -#define CHECK_UNSTABLE_SORT_RESULT(X) checkUnstableSortResult(in, out, N) -#define CHECK_UNSTABLE_SORT_PAIR_RESULT(X) checkUnstableSortResult(in, out, in_vals, out_vals, N) -#define CHECK_STABLE_SORT_RESULT(X) checkStableSortResult(in, out, N) -#define CHECK_STABLE_SORT_PAIR_RESULT(X) checkStableSortResult(in, out, in_vals, out_vals, N) +#define CHECK_UNSTABLE_SORT_RESULT(X) checkUnstableSortResult(in, out, N) +#define CHECK_UNSTABLE_SORT_PAIR_RESULT(X) \ + checkUnstableSortResult(in, out, in_vals, out_vals, N) +#define CHECK_STABLE_SORT_RESULT(X) 
checkStableSortResult(in, out, N) +#define CHECK_STABLE_SORT_PAIR_RESULT(X) \ + checkStableSortResult(in, out, in_vals, out_vals, N) #include #include @@ -30,9 +32,9 @@ /* * Sort Exercise * - * Exercise demonstrates how to perform RAJA unstable and stable sort operations - * for integer arrays, including pairs variant, using different comparators. - * Other array data types, comparators, etc. are similar + * Exercise demonstrates how to perform RAJA unstable and stable sort + * operations for integer arrays, including pairs variant, using different + * comparators. Other array data types, comparators, etc. are similar * * RAJA features shown: * - `RAJA::sort` and `RAJA::sort_pairs` methods @@ -60,14 +62,20 @@ constexpr int HIP_BLOCK_SIZE = 16; template void checkUnstableSortResult(const T* in, const T* out, int N); template -void checkUnstableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N); +void checkUnstableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N); // template void checkStableSortResult(const T* in, const T* out, int N); template -void checkStableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N); +void checkStableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N); // template void printArray(const T* k, int N); @@ -81,27 +89,27 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA sort example...\n"; // _sort_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 20; -// -// Allocate and initialize vector data -// + // + // Allocate and initialize vector data + // int* in = memoryManager::allocate(N); int* out = memoryManager::allocate(N); unsigned* in_vals = memoryManager::allocate(N); unsigned* out_vals = memoryManager::allocate(N); - std::iota(in , in + N/2, 0); - std::iota(in + N/2, in + N , 0); - std::shuffle(in , in + N/2, std::mt19937{12345u}); - std::shuffle(in + N/2, in + N , std::mt19937{67890u}); + std::iota(in, in + N / 2, 0); + std::iota(in + N / 2, in + N, 0); + std::shuffle(in, in + N / 2, std::mt19937{12345u}); + std::shuffle(in + N / 2, in + N, std::mt19937{67890u}); - std::fill(in_vals , in_vals + N/2, 0); - std::fill(in_vals + N/2, in_vals + N , 1); + std::fill(in_vals, in_vals + N / 2, 0); + std::fill(in_vals + N / 2, in_vals + N, 1); std::cout << "\n in keys...\n"; printArray(in, N); @@ -112,10 +120,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _sort_array_init_end -//----------------------------------------------------------------------------// -// Perform various sequential sorts to illustrate unstable/stable, -// pairs, default sorts with different comparators -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform various sequential sorts to illustrate unstable/stable, + // pairs, default sorts with different comparators + //----------------------------------------------------------------------------// std::cout << "\n Running sequential sort (default)...\n"; @@ -125,12 +133,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::sort(RAJA::make_span(out, N)); // _sort_seq_end - //checkUnstableSortResult>(in, out, N); + // checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential sort (non-decreasing)...\n"; @@ -141,12 +149,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::less{}); // _sort_seq_less_end - //checkUnstableSortResult>(in, out, N); + // checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential stable_sort (non-decreasing)...\n"; @@ -157,12 +165,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::less{}); // _sort_stable_seq_less_end - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential stable_sort (non-increasing)...\n"; @@ -173,12 +181,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::greater{}); // _sort_stable_seq_greater_end - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_GREATER); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential sort_pairs (non-decreasing)...\n"; @@ -191,12 +199,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::less{}); // _sort_pairs_seq_less_end - //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + // checkUnstableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); printArray(out, out_vals, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential stable_sort_pairs (non-increasing)...\n"; @@ -209,7 +218,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::greater{}); // _sort_stable_pairs_seq_greater_end - //checkStableSortResult>(in, out, in_vals, out_vals, N); + // checkStableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; @@ -217,9 +227,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// Perform a couple of OpenMP sorts... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of OpenMP sorts... 
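The solution calls below also lose their execution-policy template arguments in this form; the OpenMP sort, for instance, presumably reads:

    RAJA::sort<RAJA::omp_parallel_for_exec>(RAJA::make_span(out, N),
                                            RAJA::operators::less<int>{});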
+ //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP sort (non-decreasing)...\n"; @@ -230,12 +240,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::less{}); // _sort_omp_less_end - //checkUnstableSortResult>(in, out, N); + // checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP stable_sort_pairs (non-increasing)...\n"; @@ -243,25 +253,27 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in_vals, N, out_vals); // _sort_stable_pairs_omp_greater_start - RAJA::stable_sort_pairs(RAJA::make_span(out, N), - RAJA::make_span(out_vals, N), - RAJA::operators::greater{}); + RAJA::stable_sort_pairs( + RAJA::make_span(out, N), + RAJA::make_span(out_vals, N), + RAJA::operators::greater{}); // _sort_stable_pairs_omp_greater_end - //checkStableSortResult>(in, out, in_vals, out_vals, N); + // checkStableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// Perform a couple of CUDA sorts... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of CUDA sorts... 
+ //----------------------------------------------------------------------------// std::cout << "\n Running CUDA sort_pairs (non-increasing)...\n"; @@ -269,41 +281,43 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in_vals, N, out_vals); // _sort_pairs_cuda_greater_start - RAJA::sort_pairs>(RAJA::make_span(out, N), - RAJA::make_span(out_vals, N), - RAJA::operators::greater{}); + RAJA::sort_pairs>( + RAJA::make_span(out, N), + RAJA::make_span(out_vals, N), + RAJA::operators::greater{}); // _sort_pairs_cuda_greater_end - //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + // checkUnstableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA stable_sort (non-decreasing)...\n"; std::copy_n(in, N, out); // _sort_stable_cuda_less_start - RAJA::stable_sort>(RAJA::make_span(out, N), - RAJA::operators::less{}); + RAJA::stable_sort>( + RAJA::make_span(out, N), RAJA::operators::less{}); // _sort_stable_cuda_less_end - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// Perform a couple of HIP sorts... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of HIP sorts... 
+ //----------------------------------------------------------------------------// std::cout << "\n Running HIP sort_pairs (non-decreasing)...\n"; @@ -313,38 +327,41 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) int* d_out = memoryManager::allocate_gpu(N); int* d_out_vals = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_out_vals, out_vals, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_out_vals, out_vals, N * sizeof(int), hipMemcpyHostToDevice)); - RAJA::sort_pairs>(RAJA::make_span(d_out, N), - RAJA::make_span(d_out_vals, N), - RAJA::operators::less{}); + RAJA::sort_pairs>( + RAJA::make_span(d_out, N), + RAJA::make_span(d_out_vals, N), + RAJA::operators::less{}); - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); - hipErrchk(hipMemcpy( out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); + hipErrchk( + hipMemcpy(out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost)); - //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + // checkUnstableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); printArray(out, out_vals, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP stable_sort (non-increasing)...\n"; std::copy_n(in, N, out); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); // _sort_stable_hip_greater_start RAJA::stable_sort>( - RAJA::make_span(d_out, N), - RAJA::operators::greater{}); + RAJA::make_span(d_out, N), RAJA::operators::greater{}); // _sort_stable_hip_greater_end - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_GREATER); printArray(out, N); std::cout << "\n"; @@ -355,11 +372,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(in); memoryManager::deallocate(out); @@ -389,9 +406,11 @@ void checkUnstableSortResult(const T* in, const T* out, int N) // make map of keys to keys using val_map = std::unordered_multiset; std::unordered_map keys; - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { auto key_iter = keys.find(in[i]); - if (key_iter == keys.end()) { + if (key_iter == keys.end()) + { auto ret = keys.emplace(in[i], val_map{}); assert(ret.second); key_iter = ret.first; @@ -399,54 +418,60 @@ void checkUnstableSortResult(const T* in, const T* out, int N) key_iter->second.emplace(in[i]); } - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { // test ordering - if (i > 0 && comp(out[i], out[i-1])) { - if (correct) { + if (i > 0 && comp(out[i], out[i - 1])) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i-1] << ", " << out[i] - << " out of order" - << " (at index " << i-1 << ")\n"; + std::cout << "\t" << out[i - 1] << ", " << out[i] << " out of order" + << " (at index " << i - 1 << ")\n"; } // test there is an item with this auto key_iter = keys.find(out[i]); - if (key_iter == keys.end()) { - if (correct) { + if (key_iter == keys.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i] - << " unknown or duplicate key" + std::cout << "\t" << out[i] << " unknown or duplicate key" << " (at index " << i << ")\n"; } auto val_iter = key_iter->second.find(out[i]); - if (val_iter == key_iter->second.end()) { - if (correct) { + if (val_iter == key_iter->second.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i] - << " unknown or duplicate val" + std::cout << "\t" << out[i] << " unknown or duplicate val" << " (at index " << i << ")\n"; } key_iter->second.erase(val_iter); - if (key_iter->second.size() == 0) { + if (key_iter->second.size() == 0) + { keys.erase(key_iter); } } - if (correct) { + if (correct) + { std::cout << "\n\t result -- CORRECT\n"; } } template -void checkUnstableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N) +void checkUnstableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N) { Comparator comp; bool correct = true; @@ -454,9 +479,11 @@ void checkUnstableSortResult(const T* in, const T* out, // make map of keys to vals using val_map = std::unordered_multiset; std::unordered_map keys_to_vals; - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { auto key_iter = keys_to_vals.find(in[i]); - if (key_iter == keys_to_vals.end()) { + if (key_iter == keys_to_vals.end()) + { auto ret = keys_to_vals.emplace(in[i], val_map{}); assert(ret.second); key_iter = ret.first; @@ -464,48 +491,57 @@ void checkUnstableSortResult(const T* in, const T* out, key_iter->second.emplace(in_vals[i]); } - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { // test ordering - if (i > 0 && comp(out[i], out[i-1])) { - if (correct) { + if (i > 0 && comp(out[i], out[i - 1])) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i-1] << "," << out_vals[i-1] << ")," - << " (" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i - 1] << "," << out_vals[i - 1] << ")," + << " (" << out[i] << "," << out_vals[i] << ")" << " out of order" - 
<< " (at index " << i-1 << ")\n"; + << " (at index " << i - 1 << ")\n"; } // test there is a pair with this key and val auto key_iter = keys_to_vals.find(out[i]); - if (key_iter == keys_to_vals.end()) { - if (correct) { + if (key_iter == keys_to_vals.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i] << "," << out_vals[i] << ")" << " unknown or duplicate key" << " (at index " << i << ")\n"; } auto val_iter = key_iter->second.find(out_vals[i]); - if (val_iter == key_iter->second.end()) { - if (correct) { + if (val_iter == key_iter->second.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i] << "," << out_vals[i] << ")" << " unknown or duplicate val" << " (at index " << i << ")\n"; } key_iter->second.erase(val_iter); - if (key_iter->second.size() == 0) { + if (key_iter->second.size() == 0) + { keys_to_vals.erase(key_iter); } } - if (correct) { + if (correct) + { std::cout << "\n\t result -- CORRECT\n"; } } @@ -522,9 +558,11 @@ void checkStableSortResult(const T* in, const T* out, int N) // make map of keys to keys using val_map = std::list; std::unordered_map keys; - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { auto key_iter = keys.find(in[i]); - if (key_iter == keys.end()) { + if (key_iter == keys.end()) + { auto ret = keys.emplace(in[i], val_map{}); assert(ret.second); key_iter = ret.first; @@ -532,53 +570,59 @@ void checkStableSortResult(const T* in, const T* out, int N) key_iter->second.emplace_back(in[i]); } - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { // test ordering - if (i > 0 && comp(out[i], out[i-1])) { - if (correct) { + if (i > 0 && comp(out[i], out[i - 1])) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i-1] << ", " << out[i] - << " out of order " - << " (at index " << i-1 << ")\n"; + std::cout << "\t" << out[i - 1] << ", " << out[i] << " out of order " + << " (at index " << i - 1 << ")\n"; } // test there is an item with this auto key_iter = keys.find(out[i]); - if (key_iter == keys.end()) { - if (correct) { + if (key_iter == keys.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i] - << " unknown or duplicate key " + std::cout << "\t" << out[i] << " unknown or duplicate key " << " (at index " << i << ")\n"; } - if (key_iter->second.front() != out[i]) { - if (correct) { + if (key_iter->second.front() != out[i]) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i] - << " out of stable order or unknown val " + std::cout << "\t" << out[i] << " out of stable order or unknown val " << " (at index " << i << ")\n"; } key_iter->second.pop_front(); - if (key_iter->second.size() == 0) { + if (key_iter->second.size() == 0) + { keys.erase(key_iter); } } - if (correct) { + if (correct) + { std::cout << "\n\t result -- CORRECT\n"; } } template -void checkStableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N) +void checkStableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N) { Comparator comp; bool correct = true; @@ -586,9 +630,11 @@ void checkStableSortResult(const T* in, const T* out, // make map of 
keys to vals using val_map = std::list; std::unordered_map keys_to_vals; - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { auto key_iter = keys_to_vals.find(in[i]); - if (key_iter == keys_to_vals.end()) { + if (key_iter == keys_to_vals.end()) + { auto ret = keys_to_vals.emplace(in[i], val_map{}); assert(ret.second); key_iter = ret.first; @@ -596,47 +642,56 @@ void checkStableSortResult(const T* in, const T* out, key_iter->second.emplace_back(in_vals[i]); } - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { // test ordering - if (i > 0 && comp(out[i], out[i-1])) { - if (correct) { + if (i > 0 && comp(out[i], out[i - 1])) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i-1] << "," << out_vals[i-1] << ")," - << " (" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i - 1] << "," << out_vals[i - 1] << ")," + << " (" << out[i] << "," << out_vals[i] << ")" << " out of order " - << " (at index " << i-1 << ")\n"; + << " (at index " << i - 1 << ")\n"; } // test there is a pair with this key and val auto key_iter = keys_to_vals.find(out[i]); - if (key_iter == keys_to_vals.end()) { - if (correct) { + if (key_iter == keys_to_vals.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i] << "," << out_vals[i] << ")" << " unknown or duplicate key " << " (at index " << i << ")\n"; } - if (key_iter->second.front() != out_vals[i]) { - if (correct) { + if (key_iter->second.front() != out_vals[i]) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i] << "," << out_vals[i] << ")" << " out of stable order or unknown val " << " (at index " << i << ")\n"; } key_iter->second.pop_front(); - if (key_iter->second.size() == 0) { + if (key_iter->second.size() == 0) + { keys_to_vals.erase(key_iter); } } - if (correct) { + if (correct) + { std::cout << "\n\t result -- CORRECT\n"; } } @@ -649,7 +704,10 @@ template void printArray(const T* k, int N) { std::cout << std::endl; - for (int i = 0; i < N; ++i) { std::cout << " " << k[i]; } + for (int i = 0; i < N; ++i) + { + std::cout << " " << k[i]; + } std::cout << std::endl; } @@ -657,7 +715,9 @@ template void printArray(const T* k, const U* v, int N) { std::cout << std::endl; - for (int i = 0; i < N; ++i) { std::cout << " (" << k[i] << "," << v[i] << ")"; } + for (int i = 0; i < N; ++i) + { + std::cout << " (" << k[i] << "," << v[i] << ")"; + } std::cout << std::endl; } - diff --git a/exercises/tutorial_halfday/ex2_approx-pi.cpp b/exercises/tutorial_halfday/ex2_approx-pi.cpp index c1ccc05aee..f5487fd9f9 100644 --- a/exercises/tutorial_halfday/ex2_approx-pi.cpp +++ b/exercises/tutorial_halfday/ex2_approx-pi.cpp @@ -15,7 +15,7 @@ * EXERCISE #2: Approximate pi using a Riemann sum * * In this exercise, you will apprimate pi using the formula - * + * * pi/4 = atan(1) = integral (1/1+x^2) dx, where integral is over the * interval [0, 1]. 
* @@ -28,7 +28,7 @@ * - `forall` loop iteration template method * - Index range segment * - Sum reduction - * - Execution and reduction policies + * - Execution and reduction policies */ /* @@ -46,38 +46,38 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nExercise #2: Approximate pi using a Riemann sum...\n"; -// -// Define number of subintervals (N) and size of each subinterval (dx) used in -// Riemann integral sum to approximate pi. -// + // + // Define number of subintervals (N) and size of each subinterval (dx) used in + // Riemann integral sum to approximate pi. + // const int N = 512 * 512; - const double dx = 1.0 / double(N); + const double dx = 1.0 / double(N); -// Set precision for printing pi + // Set precision for printing pi int prec = 16; -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential pi approximation...\n"; - + double c_pi = 0.0; - for (int i = 0; i < N; ++i) { - double x = (double(i) + 0.5) * dx; - c_pi += dx / (1.0 + x * x); + for (int i = 0; i < N; ++i) + { + double x = (double(i) + 0.5) * dx; + c_pi += dx / (1.0 + x * x); } c_pi *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << c_pi << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << c_pi << std::endl; -//----------------------------------------------------------------------------// -// RAJA sequential variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA sequential variant. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential pi approximation...\n"; @@ -85,31 +85,30 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the pi approximation kernel using a RAJA::forall - /// method with RAJA::seq_exec execution policy type and a + /// method with RAJA::seq_exec execution policy type and a /// RAJA::ReduceSum object with RAJA::seq_reduce policy type /// to accumulate the sum. /// /// NOTE: We've done this one for you to help you get started... /// - using EXEC_POL1 = RAJA::seq_exec; + using EXEC_POL1 = RAJA::seq_exec; using REDUCE_POL1 = RAJA::seq_reduce; - RAJA::ReduceSum< REDUCE_POL1, double > seq_pi(0.0); + RAJA::ReduceSum seq_pi(0.0); - RAJA::forall< EXEC_POL1 >(RAJA::RangeSegment(0, N), [=](int i) { - double x = (double(i) + 0.5) * dx; - seq_pi += dx / (1.0 + x * x); + RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { + double x = (double(i) + 0.5) * dx; + seq_pi += dx / (1.0 + x * x); }); - double seq_pi_val = seq_pi.get() * 4.0; + double seq_pi_val = seq_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << seq_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << seq_pi_val << std::endl; -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. 
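The reduction object and forall call above lose their template parameters in the reformatted lines; a sketch of the sequential variant with them restored, using the policy aliases defined in the same hunk:

    RAJA::ReduceSum<REDUCE_POL1, double> seq_pi(0.0);

    RAJA::forall<EXEC_POL1>(RAJA::RangeSegment(0, N), [=](int i) {
      double x = (double(i) + 0.5) * dx;
      seq_pi += dx / (1.0 + x * x);
    });

    double seq_pi_val = seq_pi.get() * 4.0;  // accumulated sum times 4 approximates pi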
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -117,22 +116,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) double c_pi_omp = 0.0; - #pragma omp parallel for reduction(+:c_pi_omp) - for (int i = 0; i < N; ++i) { - double x = (double(i) + 0.5) * dx; - c_pi_omp += dx / (1.0 + x * x); +#pragma omp parallel for reduction(+ : c_pi_omp) + for (int i = 0; i < N; ++i) + { + double x = (double(i) + 0.5) * dx; + c_pi_omp += dx / (1.0 + x * x); } c_pi_omp *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << c_pi_omp << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << c_pi_omp << std::endl; #endif -//----------------------------------------------------------------------------// -// RAJA OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA OpenMP multithreading variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -142,23 +141,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the pi approximation kernel using a RAJA::forall - /// method with RAJA::omp_parallel_for_exec execution policy type + /// method with RAJA::omp_parallel_for_exec execution policy type /// and a RAJA::ReduceSum object with RAJA::omp_reduce policy type /// to accumulate the sum. - /// + /// double omp_pi_val = 0.0; - std::cout << "\tpi = " << std::setprecision(prec) - << omp_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << omp_pi_val << std::endl; #endif -//----------------------------------------------------------------------------// -// RAJA CUDA variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA CUDA variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -168,16 +166,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the pi approximation kernel using a RAJA::forall - /// method with RAJA::cuda_exec execution policy type and a + /// method with RAJA::cuda_exec execution policy type and a /// RAJA::ReduceSum object with RAJA::cuda_reduce policy type /// to accumulate the sum. - /// + /// double cuda_pi_val = 0.0; - std::cout << "\tpi = " << std::setprecision(prec) - << cuda_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << cuda_pi_val << std::endl; #endif diff --git a/exercises/tutorial_halfday/ex2_approx-pi_solution.cpp b/exercises/tutorial_halfday/ex2_approx-pi_solution.cpp index 5654ffbea2..42a3895b48 100644 --- a/exercises/tutorial_halfday/ex2_approx-pi_solution.cpp +++ b/exercises/tutorial_halfday/ex2_approx-pi_solution.cpp @@ -15,7 +15,7 @@ * EXERCISE #2: Approximate pi using a Riemann sum * * In this exercise, you will apprimate pi using the formula - * + * * pi/4 = atan(1) = integral (1/1+x^2) dx, where integral is over the * interval [0, 1]. 
* @@ -28,7 +28,7 @@ * - `forall` loop iteration template method * - Index range segment * - Sum reduction - * - Execution and reduction policies + * - Execution and reduction policies */ /* @@ -43,59 +43,58 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nExercise #2: Approximate pi using a Riemann sum...\n"; -// -// Define number of subintervals (N) and size of each subinterval (dx) used in -// Riemann integral sum to approximate pi. -// + // + // Define number of subintervals (N) and size of each subinterval (dx) used in + // Riemann integral sum to approximate pi. + // const int N = 512 * 512; - const double dx = 1.0 / double(N); + const double dx = 1.0 / double(N); -// Set precision for printing pi + // Set precision for printing pi int prec = 16; -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential pi approximation...\n"; - + double c_pi = 0.0; - for (int i = 0; i < N; ++i) { - double x = (double(i) + 0.5) * dx; - c_pi += dx / (1.0 + x * x); + for (int i = 0; i < N; ++i) + { + double x = (double(i) + 0.5) * dx; + c_pi += dx / (1.0 + x * x); } c_pi *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << c_pi << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << c_pi << std::endl; -//----------------------------------------------------------------------------// -// RAJA sequential variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA sequential variant. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential pi approximation...\n"; - using EXEC_POL1 = RAJA::seq_exec; - using REDUCE_POL1 = RAJA::seq_reduce; + using EXEC_POL1 = RAJA::seq_exec; + using REDUCE_POL1 = RAJA::seq_reduce; - RAJA::ReduceSum< REDUCE_POL1, double > seq_pi(0.0); + RAJA::ReduceSum seq_pi(0.0); - RAJA::forall< EXEC_POL1 >(RAJA::RangeSegment(0, N), [=](int i) { - double x = (double(i) + 0.5) * dx; - seq_pi += dx / (1.0 + x * x); + RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { + double x = (double(i) + 0.5) * dx; + seq_pi += dx / (1.0 + x * x); }); double seq_pi_val = seq_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << seq_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << seq_pi_val << std::endl; -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. 
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -103,65 +102,63 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) double c_pi_omp = 0.0; - #pragma omp parallel for reduction(+:c_pi_omp) - for (int i = 0; i < N; ++i) { - double x = (double(i) + 0.5) * dx; - c_pi_omp += dx / (1.0 + x * x); +#pragma omp parallel for reduction(+ : c_pi_omp) + for (int i = 0; i < N; ++i) + { + double x = (double(i) + 0.5) * dx; + c_pi_omp += dx / (1.0 + x * x); } c_pi_omp *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << c_pi_omp << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << c_pi_omp << std::endl; #endif -//----------------------------------------------------------------------------// -// RAJA OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA OpenMP multithreading variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP pi approximation...\n"; - using EXEC_POL2 = RAJA::omp_parallel_for_exec; + using EXEC_POL2 = RAJA::omp_parallel_for_exec; using REDUCE_POL2 = RAJA::omp_reduce; - RAJA::ReduceSum< REDUCE_POL2, double > omp_pi(0.0); + RAJA::ReduceSum omp_pi(0.0); - RAJA::forall< EXEC_POL2 >(RAJA::RangeSegment(0, N), [=](int i) { - double x = (double(i) + 0.5) * dx; - omp_pi += dx / (1.0 + x * x); + RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { + double x = (double(i) + 0.5) * dx; + omp_pi += dx / (1.0 + x * x); }); double omp_pi_val = omp_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << omp_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << omp_pi_val << std::endl; #endif -//----------------------------------------------------------------------------// -// RAJA CUDA variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA CUDA variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA pi approximation...\n"; - using EXEC_POL3 = RAJA::cuda_exec; + using EXEC_POL3 = RAJA::cuda_exec; using REDUCE_POL3 = RAJA::cuda_reduce; - RAJA::ReduceSum< REDUCE_POL3, double > cuda_pi(0.0); + RAJA::ReduceSum cuda_pi(0.0); - RAJA::forall< EXEC_POL3 >(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { - double x = (double(i) + 0.5) * dx; - cuda_pi += dx / (1.0 + x * x); + RAJA::forall(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(int i) { + double x = (double(i) + 0.5) * dx; + cuda_pi += dx / (1.0 + x * x); }); double cuda_pi_val = cuda_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << cuda_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << cuda_pi_val << std::endl; #endif diff --git a/exercises/tutorial_halfday/ex5_line-of-sight.cpp b/exercises/tutorial_halfday/ex5_line-of-sight.cpp index c17fb2eb8a..1d22a04dd9 100644 --- a/exercises/tutorial_halfday/ex5_line-of-sight.cpp +++ b/exercises/tutorial_halfday/ex5_line-of-sight.cpp @@ -24,30 +24,30 @@ * * Given an observation point X on a terrain map, and a set of points * {Y0, Y1, Y2, ...} along a ray starting at X, find which points on the - * terrain at Y0, Y1, etc. 
are visible from the point at X. A point is - * visible from the point at X if and only if there is no other point on the - * terrain that blocks its view from the point at X. More precisely, - * a point on the terrain at Y is visible from the point at X if and only if - * no other point on the terrain between X and Y has a greater vertical angle + * terrain at Y0, Y1, etc. are visible from the point at X. A point is + * visible from the point at X if and only if there is no other point on the + * terrain that blocks its view from the point at X. More precisely, + * a point on the terrain at Y is visible from the point at X if and only if + * no other point on the terrain between X and Y has a greater vertical angle * from the point at X than the point at Y. So although a point at Y may - * be at a higher altitude than all other points on the terrain between Y + * be at a higher altitude than all other points on the terrain between Y * and X, the point at Y may not be visible from the point at X. * - * Let 'altX' be the altidue at point X. Suppose we have a vector 'dist' - * such that dist[i] is the horizontal distance between X and Yi, and a - * vector 'alt' such that alt[i] is the altitude at point Yi. To solve - * the line of sight problem, we compute an angle vector 'ang', where + * Let 'altX' be the altidue at point X. Suppose we have a vector 'dist' + * such that dist[i] is the horizontal distance between X and Yi, and a + * vector 'alt' such that alt[i] is the altitude at point Yi. To solve + * the line of sight problem, we compute an angle vector 'ang', where * ang[i] = arctan( (alt[i] - altX)/(dist[i]). Next, we perform a "max" - * scan on the vector 'ang' to form the vector 'ang_max'. Then, the point + * scan on the vector 'ang' to form the vector 'ang_max'. Then, the point * at Yi is visible from the point at X if ang[i] >= ang_max[i]. Otherwise, * the point at Yi is not visible. * * This file contains a C-style sequential implementation of the solution to - * the line-of-sight problem. Where indicated by comments, you will fill in + * the line-of-sight problem. Where indicated by comments, you will fill in * sequential and OpenMP versions of the algorithm using a RAJA scan operation * to compute the 'ang_max' vector and a RAJA forall method to determine which - * points are/are not visible. If you have access to an NVIDIA GPU and a CUDA - * compiler, fill in the RAJA CUDA version of the algorithm also. + * points are/are not visible. If you have access to an NVIDIA GPU and a CUDA + * compiler, fill in the RAJA CUDA version of the algorithm also. * * RAJA features you will use: * - inclusive scan operations with 'max' operator @@ -96,52 +96,59 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) int* visible = memoryManager::allocate(N); int* visible_ref = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { - dist[i] = static_cast(i+1); - double alt_fact = alt_max * ( (i+1) % 5 == 0 ? i*10 : i+1 ); - alt[i] = alt_fact * - static_cast( rand() ) / static_cast( RAND_MAX ); + for (int i = 0; i < N; ++i) + { + dist[i] = static_cast(i + 1); + double alt_fact = alt_max * ((i + 1) % 5 == 0 ? 
i * 10 : i + 1); + alt[i] = + alt_fact * static_cast(rand()) / static_cast(RAND_MAX); } // // Set angle array - // - for (int i = 0; i < N; ++i) { - ang[i] = atan2( alt[i], dist[i] ); // set angle in radians + // + for (int i = 0; i < N; ++i) + { + ang[i] = atan2(alt[i], dist[i]); // set angle in radians } -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style sequential line-of-sight algorithm...\n"; std::memset(visible_ref, 0, N * sizeof(int)); ang_max[0] = ang[0]; - for (int i = 1; i < N; ++i) { - ang_max[i] = std::max(ang[i], ang_max[i-1]); + for (int i = 1; i < N; ++i) + { + ang_max[i] = std::max(ang[i], ang_max[i - 1]); } int num_visible = 0; - for (int i = 0; i < N; ++i) { - if ( ang[i] >= ang_max[i] ) { - visible_ref[i] = 1; - num_visible++; - } else { - visible_ref[i] = 0; - } + for (int i = 0; i < N; ++i) + { + if (ang[i] >= ang_max[i]) + { + visible_ref[i] = 1; + num_visible++; + } + else + { + visible_ref[i] = 0; + } } std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible_ref, N); + // printArray(visible_ref, N); -//----------------------------------------------------------------------------// -// RAJA sequential variant -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA sequential variant + //----------------------------------------------------------------------------// std::cout << "\n\n Running RAJA sequential line-of-sight algorithm...\n"; @@ -153,7 +160,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the line-of-sight algorithm using RAJA constructs. - /// First, use a 'max' RAJA::inclusive_scan on the angle vector + /// First, use a 'max' RAJA::inclusive_scan on the angle vector /// with RAJA::seq_exec execution policy. Then, use a RAJA::forall /// template with the same execution policy to determine which /// points are visible. @@ -162,12 +169,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) num_visible = checkResult(visible, visible_ref, N); std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible, N); + // printArray(visible, N); -//----------------------------------------------------------------------------// -// RAJA OpenMP multithreading variant -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA OpenMP multithreading variant + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -181,23 +188,23 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the line-of-sight algorithm using RAJA constructs. - /// First, use a 'max' RAJA::inclusive_scan on the angle vector - /// with RAJA::omp_parallel_for_exec execution policy. 
Then, use - /// a RAJA::forall template with the same execution policy to + /// First, use a 'max' RAJA::inclusive_scan on the angle vector + /// with RAJA::omp_parallel_for_exec execution policy. Then, use + /// a RAJA::forall template with the same execution policy to /// determine which points are visible. /// num_visible = checkResult(visible, visible_ref, N); std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible, N); + // printArray(visible, N); #endif -//----------------------------------------------------------------------------// -// RAJA CUDA variant -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA CUDA variant + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -211,16 +218,16 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the line-of-sight algorithm using RAJA constructs. - /// First, use a 'max' RAJA::inclusive_scan on the angle vector - /// with RAJA::cuda_exec execution policy. Then, use a - /// RAJA::forall template with the same execution policy to + /// First, use a 'max' RAJA::inclusive_scan on the angle vector + /// with RAJA::cuda_exec execution policy. Then, use a + /// RAJA::forall template with the same execution policy to /// determine which points are visible. /// num_visible = checkResult(visible, visible_ref, N); std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible, N); + // printArray(visible, N); #endif @@ -248,13 +255,20 @@ int checkResult(int* visible, int* visible_ref, int len) int num_visible = 0; bool correct = true; - for (int i = 0; i < len; i++) { - if ( correct && visible[i] != visible_ref[i] ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (correct && visible[i] != visible_ref[i]) + { + correct = false; + } num_visible += visible[i]; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } @@ -268,7 +282,8 @@ template void printArray(T* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "v[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; diff --git a/exercises/tutorial_halfday/ex5_line-of-sight_solution.cpp b/exercises/tutorial_halfday/ex5_line-of-sight_solution.cpp index 12348816a1..5da99b7fe2 100644 --- a/exercises/tutorial_halfday/ex5_line-of-sight_solution.cpp +++ b/exercises/tutorial_halfday/ex5_line-of-sight_solution.cpp @@ -93,52 +93,59 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) int* visible = memoryManager::allocate(N); int* visible_ref = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { - dist[i] = static_cast(i+1); - double alt_fact = alt_max * ( (i+1) % 5 == 0 ? i*10 : i+1 ); - alt[i] = alt_fact * - static_cast( rand() ) / static_cast( RAND_MAX ); + for (int i = 0; i < N; ++i) + { + dist[i] = static_cast(i + 1); + double alt_fact = alt_max * ((i + 1) % 5 == 0 ? 
i * 10 : i + 1); + alt[i] = + alt_fact * static_cast(rand()) / static_cast(RAND_MAX); } // // Set angle array // - for (int i = 0; i < N; ++i) { - ang[i] = atan2( alt[i], dist[i] ); // set angle in radians + for (int i = 0; i < N; ++i) + { + ang[i] = atan2(alt[i], dist[i]); // set angle in radians } -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style sequential line-of-sight algorithm...\n"; std::memset(visible_ref, 0, N * sizeof(int)); ang_max[0] = ang[0]; - for (int i = 1; i < N; ++i) { - ang_max[i] = std::max(ang[i], ang_max[i-1]); + for (int i = 1; i < N; ++i) + { + ang_max[i] = std::max(ang[i], ang_max[i - 1]); } int num_visible = 0; - for (int i = 0; i < N; ++i) { - if ( ang[i] >= ang_max[i] ) { - visible_ref[i] = 1; - num_visible++; - } else { - visible_ref[i] = 0; - } + for (int i = 0; i < N; ++i) + { + if (ang[i] >= ang_max[i]) + { + visible_ref[i] = 1; + num_visible++; + } + else + { + visible_ref[i] = 0; + } } std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible_ref, N); + // printArray(visible_ref, N); -//----------------------------------------------------------------------------// -// RAJA sequential variant -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA sequential variant + //----------------------------------------------------------------------------// std::cout << "\n\n Running RAJA sequential line-of-sight algorithm...\n"; @@ -148,27 +155,30 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) using EXEC_POL1 = RAJA::seq_exec; - RAJA::inclusive_scan< EXEC_POL1 >(RAJA::make_span(ang, N), - RAJA::make_span(ang_max, N), - RAJA::operators::maximum{} ); + RAJA::inclusive_scan(RAJA::make_span(ang, N), + RAJA::make_span(ang_max, N), + RAJA::operators::maximum{}); - RAJA::forall< EXEC_POL1 >(RAJA::RangeSegment(0, N), [=] (int i) { - if ( ang[i] >= ang_max[i] ) { - visible[i] = 1; - } else { - visible[i] = 0; + RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { + if (ang[i] >= ang_max[i]) + { + visible[i] = 1; + } + else + { + visible[i] = 0; } }); num_visible = checkResult(visible, visible_ref, N); std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible, N); + // printArray(visible, N); -//----------------------------------------------------------------------------// -// RAJA OpenMP multithreading variant -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA OpenMP multithreading variant + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -180,28 +190,31 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) using EXEC_POL2 = RAJA::omp_parallel_for_exec; - RAJA::inclusive_scan< EXEC_POL2 >(RAJA::make_span(ang, N), - RAJA::make_span(ang_max, N), - RAJA::operators::maximum{} ); + 
RAJA::inclusive_scan(RAJA::make_span(ang, N), + RAJA::make_span(ang_max, N), + RAJA::operators::maximum{}); - RAJA::forall< EXEC_POL2 >(RAJA::RangeSegment(0, N), [=] (int i) { - if ( ang[i] >= ang_max[i] ) { - visible[i] = 1; - } else { - visible[i] = 0; + RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { + if (ang[i] >= ang_max[i]) + { + visible[i] = 1; + } + else + { + visible[i] = 0; } }); num_visible = checkResult(visible, visible_ref, N); std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible, N); + // printArray(visible, N); #endif -//----------------------------------------------------------------------------// -// RAJA CUDA variant -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA CUDA variant + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -213,21 +226,24 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) using EXEC_POL3 = RAJA::cuda_exec; - RAJA::inclusive_scan< EXEC_POL3 >(RAJA::make_span(ang, N), - RAJA::make_span(ang_max, N), - RAJA::operators::maximum{} ); + RAJA::inclusive_scan(RAJA::make_span(ang, N), + RAJA::make_span(ang_max, N), + RAJA::operators::maximum{}); - RAJA::forall< EXEC_POL3 >(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { - if ( ang[i] >= ang_max[i] ) { - visible[i] = 1; - } else { - visible[i] = 0; + RAJA::forall(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(int i) { + if (ang[i] >= ang_max[i]) + { + visible[i] = 1; + } + else + { + visible[i] = 0; } }); num_visible = checkResult(visible, visible_ref, N); std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible, N); + // printArray(visible, N); #endif @@ -255,13 +271,20 @@ int checkResult(int* visible, int* visible_ref, int len) int num_visible = 0; bool correct = true; - for (int i = 0; i < len; i++) { - if ( correct && visible[i] != visible_ref[i] ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (correct && visible[i] != visible_ref[i]) + { + correct = false; + } num_visible += visible[i]; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } @@ -275,7 +298,8 @@ template void printArray(T* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "v[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; diff --git a/exercises/tutorial_halfday/ex6_stencil-offset-layout.cpp b/exercises/tutorial_halfday/ex6_stencil-offset-layout.cpp index 4d29f7b3ae..9e3968d313 100644 --- a/exercises/tutorial_halfday/ex6_stencil-offset-layout.cpp +++ b/exercises/tutorial_halfday/ex6_stencil-offset-layout.cpp @@ -15,7 +15,7 @@ #include "memoryManager.hpp" /* - * EXERCISE #6: Offset layout stencil computation. + * EXERCISE #6: Offset layout stencil computation. * * In this exercise, you will use RAJA Layouts and Views to perform * a simple 5-point stencil computation on a 2-dimensional Cartesian mesh. @@ -26,23 +26,23 @@ * The five-cell stencil accumulates values in a cell from itself and * its four neighbors. 
Assuming the cells are indexed using (i,j) pairs on * the two dimensional mesh, the stencil computation looks like: - * + * * out(i, j) = in(i, j) + in(i - 1, j) + in(i + 1, j) + * in(i, j - 1) + in(i, j + 1) * * where 'in' is the input data array and 'out' is the result of - * the stencil computation. For simplicity, in the code examples, we refer - * to the index tuples used to access input array entries as C (center), + * the stencil computation. For simplicity, in the code examples, we refer + * to the index tuples used to access input array entries as C (center), * W (west), E (east), S (south), and N (north). * - * We assume that the input array has an entry for N x M interior mesh cells + * We assume that the input array has an entry for N x M interior mesh cells * plus a one cell wide halo region around the mesh interior; i.e., the size * of the input array is (N + 2) * (M + 2). The output array has an entry * for N x M interior mesh cells only, so its size is N * M. Note that since - * the arrays have different sizes, C-style indexing requires different + * the arrays have different sizes, C-style indexing requires different * offset values in the code for accessing a cell entry in each array. - * - * The input array is initialized so that the entry for each interior cell + * + * The input array is initialized so that the entry for each interior cell * is one and the entry for each halo cell is zero. So for the case where * N = 3 and M = 2, the input array looks like: * @@ -66,7 +66,7 @@ * | 3 | 4 | 3 | * ------------- * - * You can think about indexing into this mesh as illustrated in the + * You can think about indexing into this mesh as illustrated in the * following diagram: * * --------------------------------------------------- @@ -79,31 +79,31 @@ * | (-1,-1) | (0, -1) | (1, -1) | (2, -1) | (3, -1) | * --------------------------------------------------- * - * Notably (0, 0) corresponds to the bottom left corner of the interior - * region, which extends to (2, 1), and (-1, -1) corresponds to the bottom + * Notably (0, 0) corresponds to the bottom left corner of the interior + * region, which extends to (2, 1), and (-1, -1) corresponds to the bottom * left corner of the halo region, which extends to (3, 2). * - * This file contains two C-style sequential implementations of stencil - * computation. One (Part a) has column indexing as stride-1 with the outer - * loop traversing the rows ('i' loop variable) and the inner loop traversing - * the columns ('j' loop variable). The other (Part B) has row indexing as - * stride-1 and reverses the order of the loops. This shows that a C-style - * implementation requires two different implementations, one for each loop - * order, since the array offset arithmetic is different in the two cases. - * Where indicated by comments, you will fill in versions using - * two-dimensional RAJA Views with offset layouts. One loop ordering requires - * permutations, while the other does not. If done properly, you will see - * that both RAJA versions have identical inner loop bodies, which is not the + * This file contains two C-style sequential implementations of stencil + * computation. One (Part a) has column indexing as stride-1 with the outer + * loop traversing the rows ('i' loop variable) and the inner loop traversing + * the columns ('j' loop variable). The other (Part B) has row indexing as + * stride-1 and reverses the order of the loops. 
This shows that a C-style + * implementation requires two different implementations, one for each loop + * order, since the array offset arithmetic is different in the two cases. + * Where indicated by comments, you will fill in versions using + * two-dimensional RAJA Views with offset layouts. One loop ordering requires + * permutations, while the other does not. If done properly, you will see + * that both RAJA versions have identical inner loop bodies, which is not the * case for the C-style variants. * - * Note that you will use the same for-loop patterns as the C-style loops. + * Note that you will use the same for-loop patterns as the C-style loops. * In a later exercise, we will show you how to use RAJA's nested loop - * support, which allows you to write both RAJA variants with identical + * support, which allows you to write both RAJA variants with identical * source code. * * RAJA features you will use: * - Offset-layouts and RAJA Views - * + * * Since this exercise is done on a CPU only, we use C++ new and delete * operators to allocate and deallocate the arrays we will use. */ @@ -111,14 +111,14 @@ // // Functions for printing and checking results // -// For array printing, 'stride1dim' indicates which mesh dimenstride is -// stride-1 (Rows indicates each row is stride-1, +// For array printing, 'stride1dim' indicates which mesh dimenstride is +// stride-1 (Rows indicates each row is stride-1, // Columns indicates each column is stride-1). // enum class Stride1 { - Rows, - Columns + Rows, + Columns }; void printArrayOnMesh(int* v, int Nrows, int Ncols, Stride1 stride1dim); void checkResult(int* A, int* A_ref, int Ntot); @@ -128,73 +128,76 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nExercise #6: Offset layout stencil computation...\n"; -// -// Define number of rows and columns of cells in the 2D mesh. -// - const int Nr_int = 5; + // + // Define number of rows and columns of cells in the 2D mesh. + // + const int Nr_int = 5; const int Nc_int = 8; - const int Nr_tot = Nr_int + 2; + const int Nr_tot = Nr_int + 2; const int Nc_tot = Nc_int + 2; - + const int int_cells = Nr_int * Nc_int; - const int tot_cells = Nr_tot * Nc_tot; + const int tot_cells = Nr_tot * Nc_tot; -// -// Allocate and initialize input array -// + // + // Allocate and initialize input array + // int* B = memoryManager::allocate(tot_cells * sizeof(int)); int* A = memoryManager::allocate(int_cells * sizeof(int)); int* A_ref = memoryManager::allocate(int_cells * sizeof(int)); -//----------------------------------------------------------------------------// -// Part A: -// -// Variant of stencil computation with column indexing as stride-1. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Part A: + // + // Variant of stencil computation with column indexing as stride-1. + //----------------------------------------------------------------------------// std::memset(B, 0, tot_cells * sizeof(int)); -// -// We assume that for each cell id (i,j) that j is the stride-1 index. -// - for (int i = 1; i <= Nc_int; ++i) { - for (int j = 1; j <= Nr_int; ++j) { + // + // We assume that for each cell id (i,j) that j is the stride-1 index. 
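  // For example, with Nr_int = 5 (so Nr_tot = 7), the first interior cell
  // (i = 1, j = 1) maps to idx = 1 + 7 * 1 = 8, and stepping j by one moves
  // to the adjacent entry idx = 9, so each column of the padded mesh is laid
  // out contiguously.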
+ // + for (int i = 1; i <= Nc_int; ++i) + { + for (int j = 1; j <= Nr_int; ++j) + { int idx = j + Nr_tot * i; B[idx] = 1; } } -//printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Columns); + // printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Columns); -//----------------------------------------------------------------------------// -// C-style stencil computation establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style stencil computation establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style stencil computation (reference soln)...\n"; std::memset(A_ref, 0, int_cells * sizeof(int)); - for (int i = 0; i < Nc_int; ++i) { - for (int j = 0; j < Nr_int; ++j) { + for (int i = 0; i < Nc_int; ++i) + { + for (int j = 0; j < Nr_int; ++j) + { int idx_out = j + Nr_int * i; int idx_in = (j + 1) + Nr_tot * (i + 1); - A_ref[idx_out] = B[idx_in] + // C - B[idx_in - Nr_tot] + B[idx_in + Nr_tot] + // W, E - B[idx_in - 1] + B[idx_in + 1]; // S, N - + A_ref[idx_out] = B[idx_in] + // C + B[idx_in - Nr_tot] + B[idx_in + Nr_tot] + // W, E + B[idx_in - 1] + B[idx_in + 1]; // S, N } } -//printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Columns); + // printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Columns); -//----------------------------------------------------------------------------// -// Variant using RAJA Layouts and Views (no permutation). -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Variant using RAJA Layouts and Views (no permutation). + //----------------------------------------------------------------------------// std::cout << "\n\n Running stencil computation with RAJA Views...\n"; @@ -203,114 +206,120 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE (Part A): + /// EXERCISE (Part A): /// - /// Fill in the stencil computation below where you use RAJA::View + /// Fill in the stencil computation below where you use RAJA::View /// objects for accessing entries in the A and B arrays. You will use /// a RAJA::OffsetLayout for the B array and a RAJA::Layout for the - /// A array. The B array access requires an offset since the loops - // iterate over the interior (i, j) indices. + /// A array. The B array access requires an offset since the loops + // iterate over the interior (i, j) indices. /// - /// For this part (A) of the exercise, the column (j-loop) indexing + /// For this part (A) of the exercise, the column (j-loop) indexing /// has stride 1. /// - for (int i = 0; i < Nc_int; ++i) { - for (int j = 0; j < Nr_int; ++j) { + for (int i = 0; i < Nc_int; ++i) + { + for (int j = 0; j < Nr_int; ++j) + { // fill in the loop body - } } checkResult(A, A_ref, int_cells); -//printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Columns); + // printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Columns); -//----------------------------------------------------------------------------// -// Part B: -// -// Variant of stencil computation with row indexing as stride-1. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Part B: + // + // Variant of stencil computation with row indexing as stride-1. + //----------------------------------------------------------------------------// std::memset(B, 0, tot_cells * sizeof(int)); -// -// We assume that for each cell id (i,j) that i is the stride-1 index. -// - for (int j = 1; j <= Nr_int; ++j) { - for (int i = 1; i <= Nc_int; ++i) { + // + // We assume that for each cell id (i,j) that i is the stride-1 index. + // + for (int j = 1; j <= Nr_int; ++j) + { + for (int i = 1; i <= Nc_int; ++i) + { int idx = i + Nc_tot * j; B[idx] = 1; } } -//printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Rows); + // printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Rows); -//----------------------------------------------------------------------------// -// C-style stencil computation establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style stencil computation establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style stencil computation (reference soln)...\n"; std::memset(A_ref, 0, int_cells * sizeof(int)); - for (int j = 0; j < Nr_int; ++j) { - for (int i = 0; i < Nc_int; ++i) { + for (int j = 0; j < Nr_int; ++j) + { + for (int i = 0; i < Nc_int; ++i) + { int idx_out = i + Nc_int * j; int idx_in = (i + 1) + Nc_tot * (j + 1); - A_ref[idx_out] = B[idx_in] + // C - B[idx_in - Nc_tot] + B[idx_in + Nc_tot] + // S, N - B[idx_in - 1] + B[idx_in + 1]; // W, E - + A_ref[idx_out] = B[idx_in] + // C + B[idx_in - Nc_tot] + B[idx_in + Nc_tot] + // S, N + B[idx_in - 1] + B[idx_in + 1]; // W, E } } -//printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Rows); + // printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Rows); -//----------------------------------------------------------------------------// -// Variant using RAJA Layouts and Views (with permutation). -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Variant using RAJA Layouts and Views (with permutation). + //----------------------------------------------------------------------------// - std::cout << "\n\n Running stencil computation with RAJA Views (permuted)...\n"; + std::cout << "\n\n Running stencil computation with RAJA Views " + "(permuted)...\n"; std::memset(A, 0, int_cells * sizeof(int)); /// /// TODO... /// - /// EXERCISE (Part B): + /// EXERCISE (Part B): /// - /// Fill in the stencil computation below where you use RAJA::View + /// Fill in the stencil computation below where you use RAJA::View /// objects for accessing entries in the A and B arrays. You will use /// a RAJA::OffsetLayout for the B array and a RAJA::Layout for the - /// A array. The B array access requires an offset since the loops + /// A array. The B array access requires an offset since the loops // iterate over the interior (i, j) indices. /// - /// For this part (A) of the exercise, the row (i-loop) indexing - /// has stride 1. Thus, layouts for the A and B arrays require + /// For this part (A) of the exercise, the row (i-loop) indexing + /// has stride 1. 
Thus, layouts for the A and B arrays require /// the same permutation. /// - for (int j = 0; j < Nr_int; ++j) { - for (int i = 0; i < Nc_int; ++i) { + for (int j = 0; j < Nr_int; ++j) + { + for (int i = 0; i < Nc_int; ++i) + { // fill in the loop body - } } checkResult(A, A_ref, int_cells); -//printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Rows); + // printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Rows); -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(B); memoryManager::deallocate(A); memoryManager::deallocate(A_ref); @@ -321,19 +330,24 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // -// For array printing, 'stride1dim' indicates which mesh dimenstride is -// stride-1 (0 indicates each row is stride-1, +// For array printing, 'stride1dim' indicates which mesh dimenstride is +// stride-1 (0 indicates each row is stride-1, // 1 indicates each column is stride-1). // void printArrayOnMesh(int* v, int Nrows, int Ncols, Stride1 stride1dim) { std::cout << std::endl; - for (int j = 0; j < Nrows; ++j) { - for (int i = 0; i < Ncols; ++i) { + for (int j = 0; j < Nrows; ++j) + { + for (int i = 0; i < Ncols; ++i) + { int idx = 0; - if ( stride1dim == Stride1::Columns ) { + if (stride1dim == Stride1::Columns) + { idx = j + Nrows * i; - } else { + } + else + { idx = i + Ncols * j; } std::cout << v[idx] << " "; @@ -350,15 +364,20 @@ void checkResult(int* A, int* A_ref, int Ntot) { bool pass = true; - for (int i = 0; i < Ntot; ++i) { - if ( pass && (A[i] != A_ref[i]) ) { + for (int i = 0; i < Ntot; ++i) + { + if (pass && (A[i] != A_ref[i])) + { pass = false; } } - if (pass) { + if (pass) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/tutorial_halfday/ex6_stencil-offset-layout_solution.cpp b/exercises/tutorial_halfday/ex6_stencil-offset-layout_solution.cpp index 51aad20dae..e323c3f4d3 100644 --- a/exercises/tutorial_halfday/ex6_stencil-offset-layout_solution.cpp +++ b/exercises/tutorial_halfday/ex6_stencil-offset-layout_solution.cpp @@ -15,7 +15,7 @@ #include "memoryManager.hpp" /* - * EXERCISE #6: Offset layout stencil computation. + * EXERCISE #6: Offset layout stencil computation. * * In this exercise, you will use RAJA Layouts and Views to perform * a simple 5-point stencil computation on a 2-dimensional Cartesian mesh. @@ -26,23 +26,23 @@ * The five-cell stencil accumulates values in a cell from itself and * its four neighbors. Assuming the cells are indexed using (i,j) pairs on * the two dimensional mesh, the stencil computation looks like: - * + * * out(i, j) = in(i, j) + in(i - 1, j) + in(i + 1, j) + * in(i, j - 1) + in(i, j + 1) * * where 'in' is the input data array and 'out' is the result of - * the stencil computation. For simplicity, in the code examples, we refer - * to the index tuples used to access input array entries as C (center), + * the stencil computation. For simplicity, in the code examples, we refer + * to the index tuples used to access input array entries as C (center), * W (west), E (east), S (south), and N (north). * - * We assume that the input array has an entry for N x M interior mesh cells + * We assume that the input array has an entry for N x M interior mesh cells * plus a one cell wide halo region around the mesh interior; i.e., the size * of the input array is (N + 2) * (M + 2). The output array has an entry * for N x M interior mesh cells only, so its size is N * M. 
Note that since - * the arrays have different sizes, C-style indexing requires different + * the arrays have different sizes, C-style indexing requires different * offset values in the code for accessing a cell entry in each array. - * - * The input array is initialized so that the entry for each interior cell + * + * The input array is initialized so that the entry for each interior cell * is one and the entry for each halo cell is zero. So for the case where * N = 3 and M = 2, the input array looks like: * @@ -66,7 +66,7 @@ * | 3 | 4 | 3 | * ------------- * - * You can think about indexing into this mesh as illustrated in the + * You can think about indexing into this mesh as illustrated in the * following diagram: * * --------------------------------------------------- @@ -79,31 +79,31 @@ * | (-1,-1) | (0, -1) | (1, -1) | (2, -1) | (3, -1) | * --------------------------------------------------- * - * Notably (0, 0) corresponds to the bottom left corner of the interior - * region, which extends to (2, 1), and (-1, -1) corresponds to the bottom + * Notably (0, 0) corresponds to the bottom left corner of the interior + * region, which extends to (2, 1), and (-1, -1) corresponds to the bottom * left corner of the halo region, which extends to (3, 2). * - * This file contains two C-style sequential implementations of stencil - * computation. One has column indexing as stride-1 with the outer loop - * traversing the rows ('i' loop variable) and the inner loop traversing the + * This file contains two C-style sequential implementations of stencil + * computation. One has column indexing as stride-1 with the outer loop + * traversing the rows ('i' loop variable) and the inner loop traversing the * columns ('j' loop variable). The other has row indexing as stride-1 and - * reverses the order of the loops. This shows that a C-style implementation + * reverses the order of the loops. This shows that a C-style implementation * requires two different implementations, one for each loop order, since the - * array offset arithmetic is different in the two cases. Where indicated + * array offset arithmetic is different in the two cases. Where indicated * by comments, you will fill in versions using two-dimensional RAJA Views * with offset layouts. One loop ordering requires permutations, while the * other does not. If done properly, you will see that both RAJA versions * have identical inner loop bodies, which is not the case for the C-style * variants. * - * Note that you will use the same for-loop patterns as the C-style loops. + * Note that you will use the same for-loop patterns as the C-style loops. * In a later exercise, we will show you how to use RAJA's nested loop - * support, which allows you to write both RAJA variants with identical + * support, which allows you to write both RAJA variants with identical * source code. * * RAJA features you will use: * - Offset-layouts and RAJA Views - * + * * Since this exercise is done on a CPU only, we use C++ new and delete * operators to allocate and deallocate the arrays we will use. */ @@ -111,14 +111,14 @@ // // Functions for printing and checking results // -// For array printing, 'stride1dim' indicates which mesh dimenstride is -// stride-1 (Rows indicates each row is stride-1, +// For array printing, 'stride1dim' indicates which mesh dimenstride is +// stride-1 (Rows indicates each row is stride-1, // Columns indicates each column is stride-1). 
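//
// For example, Stride1::Columns selects the idx = j + Nrows * i mapping used
// when each column is contiguous, while Stride1::Rows selects
// idx = i + Ncols * j; main() below exercises both cases via the
// commented-out printArrayOnMesh(...) calls.
//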
// enum class Stride1 { - Rows, - Columns + Rows, + Columns }; void printArrayOnMesh(int* v, int Nrows, int Ncols, Stride1 stride1dim); void checkResult(int* A, int* A_ref, int Ntot); @@ -128,73 +128,76 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nExercise #6: Offset layout stencil computation...\n"; -// -// Define number of rows and columns of cells in the 2D mesh. -// + // + // Define number of rows and columns of cells in the 2D mesh. + // const int DIM = 2; - const int Nr_int = 5; + const int Nr_int = 5; const int Nc_int = 8; - const int Nr_tot = Nr_int + 2; + const int Nr_tot = Nr_int + 2; const int Nc_tot = Nc_int + 2; - + const int int_cells = Nr_int * Nc_int; - const int tot_cells = Nr_tot * Nc_tot; + const int tot_cells = Nr_tot * Nc_tot; -// -// Allocate and initialize input array -// + // + // Allocate and initialize input array + // int* B = memoryManager::allocate(tot_cells * sizeof(int)); int* A = memoryManager::allocate(int_cells * sizeof(int)); int* A_ref = memoryManager::allocate(int_cells * sizeof(int)); -//----------------------------------------------------------------------------// -// First variant of stencil computation with column indexing as stride-1. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // First variant of stencil computation with column indexing as stride-1. + //----------------------------------------------------------------------------// std::memset(B, 0, tot_cells * sizeof(int)); -// -// We assume that for each cell id (i,j) that j is the stride-1 index. -// - for (int i = 1; i <= Nc_int; ++i) { - for (int j = 1; j <= Nr_int; ++j) { + // + // We assume that for each cell id (i,j) that j is the stride-1 index. + // + for (int i = 1; i <= Nc_int; ++i) + { + for (int j = 1; j <= Nr_int; ++j) + { int idx = j + Nr_tot * i; B[idx] = 1; } } -//printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Columns); + // printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Columns); -//----------------------------------------------------------------------------// -// C-style stencil computation establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style stencil computation establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style stencil computation (reference soln)...\n"; std::memset(A_ref, 0, int_cells * sizeof(int)); - for (int i = 0; i < Nc_int; ++i) { - for (int j = 0; j < Nr_int; ++j) { + for (int i = 0; i < Nc_int; ++i) + { + for (int j = 0; j < Nr_int; ++j) + { int idx_out = j + Nr_int * i; int idx_in = (j + 1) + Nr_tot * (i + 1); - A_ref[idx_out] = B[idx_in] + // C - B[idx_in - Nr_tot] + B[idx_in + Nr_tot] + // W, E - B[idx_in - 1] + B[idx_in + 1]; // S, N - + A_ref[idx_out] = B[idx_in] + // C + B[idx_in - Nr_tot] + B[idx_in + Nr_tot] + // W, E + B[idx_in - 1] + B[idx_in + 1]; // S, N } } -//printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Columns); + // printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Columns); -//----------------------------------------------------------------------------// -// Variant using RAJA Layouts and Views (no permutation). 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Variant using RAJA Layouts and Views (no permutation). + //----------------------------------------------------------------------------// std::cout << "\n\n Running stencil computation with RAJA Views...\n"; @@ -203,78 +206,83 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Create offset Layout and Views for data access. Note that only // the input array access requires an offset since the loops iterate over - // the interior (i, j) indices. We can use the default layout for the - // output array. Also, since the 'j' index (rightmost) is stride-1, + // the interior (i, j) indices. We can use the default layout for the + // output array. Also, since the 'j' index (rightmost) is stride-1, // we don't need a permutation for this case. // RAJA::OffsetLayout B_layout = - RAJA::make_offset_layout({{-1, -1}}, {{Nc_tot-1, Nr_tot-1}}); + RAJA::make_offset_layout({{-1, -1}}, {{Nc_tot - 1, Nr_tot - 1}}); RAJA::View> Bview(B, B_layout); RAJA::View> Aview(A, Nc_int, Nr_int); - for (int i = 0; i < Nc_int; ++i) { - for (int j = 0; j < Nr_int; ++j) { - - Aview(i, j) = Bview(i, j) + // C - Bview(i - 1, j) + Bview(i + 1, j) + // W, E - Bview(i, j - 1) + Bview(i, j + 1); // S, N + for (int i = 0; i < Nc_int; ++i) + { + for (int j = 0; j < Nr_int; ++j) + { + Aview(i, j) = Bview(i, j) + // C + Bview(i - 1, j) + Bview(i + 1, j) + // W, E + Bview(i, j - 1) + Bview(i, j + 1); // S, N } } checkResult(A, A_ref, int_cells); -//printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Columns); + // printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Columns); -//----------------------------------------------------------------------------// -// Second variant of stencil computation with row indexing as stride-1. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Second variant of stencil computation with row indexing as stride-1. + //----------------------------------------------------------------------------// std::memset(B, 0, tot_cells * sizeof(int)); -// -// We assume that for each cell id (i,j) that i is the stride-1 index. -// - for (int j = 1; j <= Nr_int; ++j) { - for (int i = 1; i <= Nc_int; ++i) { + // + // We assume that for each cell id (i,j) that i is the stride-1 index. + // + for (int j = 1; j <= Nr_int; ++j) + { + for (int i = 1; i <= Nc_int; ++i) + { int idx = i + Nc_tot * j; B[idx] = 1; } } -//printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Rows); + // printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Rows); -//----------------------------------------------------------------------------// -// C-style stencil computation establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style stencil computation establishes reference solution to compare with. 
+ //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style stencil computation (reference soln)...\n"; std::memset(A_ref, 0, int_cells * sizeof(int)); - for (int j = 0; j < Nr_int; ++j) { - for (int i = 0; i < Nc_int; ++i) { + for (int j = 0; j < Nr_int; ++j) + { + for (int i = 0; i < Nc_int; ++i) + { int idx_out = i + Nc_int * j; int idx_in = (i + 1) + Nc_tot * (j + 1); - A_ref[idx_out] = B[idx_in] + // C - B[idx_in - Nc_tot] + B[idx_in + Nc_tot] + // S, N - B[idx_in - 1] + B[idx_in + 1]; // W, E - + A_ref[idx_out] = B[idx_in] + // C + B[idx_in - Nc_tot] + B[idx_in + Nc_tot] + // S, N + B[idx_in - 1] + B[idx_in + 1]; // W, E } } -//printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Rows); + // printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Rows); -//----------------------------------------------------------------------------// -// Variant using RAJA Layouts and Views (with permutation). -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Variant using RAJA Layouts and Views (with permutation). + //----------------------------------------------------------------------------// - std::cout << "\n\n Running stencil computation with RAJA Views (permuted)...\n"; + std::cout << "\n\n Running stencil computation with RAJA Views " + "(permuted)...\n"; std::memset(A, 0, int_cells * sizeof(int)); @@ -289,35 +297,35 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // application. // - std::array perm {{1, 0}}; // 'i' index (position zero0) - // is stride-1 + std::array perm{{1, 0}}; // 'i' index (position zero0) + // is stride-1 - RAJA::OffsetLayout pB_layout = - RAJA::make_permuted_offset_layout( {{-1, -1}}, {{Nc_tot-1, Nr_tot-1}}, - perm ); + RAJA::OffsetLayout pB_layout = RAJA::make_permuted_offset_layout( + {{-1, -1}}, {{Nc_tot - 1, Nr_tot - 1}}, perm); - RAJA::Layout pA_layout = - RAJA::make_permuted_layout( {{Nc_int, Nr_int}}, perm ); + RAJA::Layout pA_layout = + RAJA::make_permuted_layout({{Nc_int, Nr_int}}, perm); RAJA::View> pBview(B, pB_layout); RAJA::View> pAview(A, pA_layout); - for (int j = 0; j < Nr_int; ++j) { - for (int i = 0; i < Nc_int; ++i) { - - pAview(i, j) = pBview(i, j) + // C - pBview(i - 1, j) + pBview(i + 1, j) + // W, E - pBview(i, j - 1) + pBview(i, j + 1); // S, N + for (int j = 0; j < Nr_int; ++j) + { + for (int i = 0; i < Nc_int; ++i) + { + pAview(i, j) = pBview(i, j) + // C + pBview(i - 1, j) + pBview(i + 1, j) + // W, E + pBview(i, j - 1) + pBview(i, j + 1); // S, N } } checkResult(A, A_ref, int_cells); -//printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Rows); + // printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Rows); -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(B); memoryManager::deallocate(A); memoryManager::deallocate(A_ref); @@ -328,19 +336,24 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // -// For array printing, 'stride1dim' indicates which mesh dimenstride is -// stride-1 (0 indicates each row is stride-1, +// For array printing, 'stride1dim' indicates which mesh dimenstride is +// stride-1 (0 indicates each row is stride-1, // 1 indicates each column is stride-1). 
// void printArrayOnMesh(int* v, int Nrows, int Ncols, Stride1 stride1dim) { std::cout << std::endl; - for (int j = 0; j < Nrows; ++j) { - for (int i = 0; i < Ncols; ++i) { + for (int j = 0; j < Nrows; ++j) + { + for (int i = 0; i < Ncols; ++i) + { int idx = 0; - if ( stride1dim == Stride1::Columns ) { + if (stride1dim == Stride1::Columns) + { idx = j + Nrows * i; - } else { + } + else + { idx = i + Ncols * j; } std::cout << v[idx] << " "; @@ -357,15 +370,20 @@ void checkResult(int* A, int* A_ref, int Ntot) { bool pass = true; - for (int i = 0; i < Ntot; ++i) { - if ( pass && (A[i] != A_ref[i]) ) { + for (int i = 0; i < Ntot; ++i) + { + if (pass && (A[i] != A_ref[i])) + { pass = false; } } - if (pass) { + if (pass) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp index d183c221fa..f9ac15ab9e 100644 --- a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp +++ b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp @@ -16,13 +16,13 @@ /* * EXERCISE #8: Tiled Matrix Transpose * - * In this exercise, you will use RAJA constructs to transpose a matrix + * In this exercise, you will use RAJA constructs to transpose a matrix * using a loop tiling algorithm. An input matrix A of dimension N_r x N_c * is provided. You will fill in the entries of the transpose matrix At. * * This file contains a C-style variant of the sequential matrix transpose. * You will complete implementations of multiple RAJA variants by filling - * in missing elements of RAJA kernel API execution policies as well as the + * in missing elements of RAJA kernel API execution policies as well as the * RAJA kernel implementation for each. Variants you will complete include * sequential, OpenMP, and CUDA execution. 
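 *
 * As a concrete illustration of the tiling (using the names that appear in
 * the C-style loops below): the tile with outer indices (by, bx) covers the
 * entries row = by * TILE_SZ + trow, col = bx * TILE_SZ + tcol for
 * trow, tcol in [0, TILE_SZ), and the bounds check row < N_r && col < N_c
 * simply skips the part of a partial tile that falls outside the matrix.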
* @@ -52,7 +52,7 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c); template void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise #8: RAJA Tiled Matrix Transpose...\n"; @@ -66,8 +66,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of tiled matrix transpose, we @@ -80,9 +80,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Construct a permuted layout for At so that the column index has stride 1 // - std::array perm {{1, 0}}; - RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout( {{N_c, N_r}}, - perm ); + std::array perm{{1, 0}}; + RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout({{N_c, N_r}}, perm); RAJA::View> Atview(At, perm_layout); // @@ -97,14 +96,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of tiled matrix transpose...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -112,8 +113,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // // (1) Loops to iterate over tile entries @@ -121,29 +124,31 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that output matrix data access // is stride-1. // - for (int trow = 0; trow < TILE_SZ; ++trow) { - for (int tcol = 0; tcol < TILE_SZ; ++tcol) { + for (int trow = 0; trow < TILE_SZ; ++trow) + { + for (int tcol = 0; tcol < TILE_SZ; ++tcol) + { - int col = bx * TILE_SZ + tcol; // Matrix column index - int row = by * TILE_SZ + trow; // Matrix row index + int col = bx * TILE_SZ + tcol; // Matrix column index + int row = by * TILE_SZ + trow; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Aview(row, col); } } } - } } checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // - // The following RAJA variants will use the RAJA::kernel method to + // The following RAJA variants will use the RAJA::kernel method to // perform the matrix transpose operation. // // Here, we define RAJA range segments to establish the iteration spaces. @@ -152,14 +157,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // global iteration number. 
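  //
  // A minimal sketch of one possible sequential tiled variant (not
  // necessarily the solution intended for the exercise below). It assumes
  // the TILE_SZ constant, the row_Range/col_Range segments, and the
  // Aview/Atview views defined in this file, and mirrors the structure of
  // the OpenMP policies that follow.
  //
  using KERNEL_EXEC_POL_SEQ =
      RAJA::KernelPolicy<
        RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_SZ>, RAJA::seq_exec,
          RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_SZ>, RAJA::seq_exec,
            RAJA::statement::For<1, RAJA::seq_exec,       // rows within a tile
              RAJA::statement::For<0, RAJA::seq_exec,     // cols within a tile (stride-1 output)
                RAJA::statement::Lambda<0>
              >
            >
          >
        >
      >;

  RAJA::kernel<KERNEL_EXEC_POL_SEQ>(
      RAJA::make_tuple(col_Range, row_Range),
      [=](int col, int row) { Atview(col, row) = Aview(row, col); });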
// -// Note: this needs to be turned on for other back-ends when working the +// Note: this needs to be turned on for other back-ends when working the // exercises (sequential, CUDA, etc.) #if defined(RAJA_ENABLE_OPENMP) RAJA::RangeSegment row_Range(0, N_r); RAJA::RangeSegment col_Range(0, N_c); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential tiled matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -199,7 +204,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running RAJA openmp tiled matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running RAJA openmp tiled matrix transpose - parallel top " + "inner loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -238,9 +244,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA openmp tiled matrix transpose - collapsed inner loops...\n"; + std::cout << "\n Running RAJA openmp tiled matrix transpose - collapsed " + "inner loops...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -249,27 +256,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // into a single OpenMP loop enabling parallel loads/reads // to/from the tile. // - using KERNEL_EXEC_POL_OMP2 = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::seq_exec, - RAJA::statement::Collapse, - RAJA::statement::Lambda<0> - > //closes collapse - > // closes Tile 0 - > // closes Tile 1 - >; // closes policy list + using KERNEL_EXEC_POL_OMP2 = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile<0, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::Collapse< + RAJA::omp_parallel_collapse_exec, + RAJA::ArgList<0, 1>, + RAJA::statement::Lambda<0>> // closes collapse + > // closes Tile 0 + > // closes Tile 1 + >; // closes policy list RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - - Atview(col, row) = Aview(row, col); - - }); + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -334,16 +338,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match &= false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -355,11 +365,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " 
<< Atview(row, col) // << std::endl; - std::cout<> Atview, int N_r, int N_c); template void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise #8: RAJA Tiled Matrix Transpose...\n"; @@ -64,8 +64,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of tiled matrix transpose, we @@ -78,9 +78,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Construct a permuted layout for At so that the column index has stride 1 // - std::array perm {{1, 0}}; - RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout( {{N_c, N_r}}, - perm ); + std::array perm{{1, 0}}; + RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout({{N_c, N_r}}, perm); RAJA::View> Atview(At, perm_layout); // @@ -95,14 +94,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of tiled matrix transpose...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -110,38 +111,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // // (1) Loops to iterate over tile entries // // Note: loops are ordered so that output matrix data access - // is stride-1. + // is stride-1. // - for (int trow = 0; trow < TILE_SZ; ++trow) { - for (int tcol = 0; tcol < TILE_SZ; ++tcol) { + for (int trow = 0; trow < TILE_SZ; ++trow) + { + for (int tcol = 0; tcol < TILE_SZ; ++tcol) + { - int col = bx * TILE_SZ + tcol; // Matrix column index - int row = by * TILE_SZ + trow; // Matrix row index + int col = bx * TILE_SZ + tcol; // Matrix column index + int row = by * TILE_SZ + trow; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Aview(row, col); } } } - } } checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // - // The following RAJA variants will use the RAJA::kernel method to + // The following RAJA variants will use the RAJA::kernel method to // perform the matrix transpose operation. // // Here, we define RAJA range segments to establish the iteration spaces. 
@@ -152,7 +157,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::RangeSegment row_Range(0, N_r); RAJA::RangeSegment col_Range(0, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential tiled matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -162,32 +167,31 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // tile_fixed corresponds to the dimension size of the tile. // - using KERNEL_EXEC_POL_SEQ = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::seq_exec, - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using KERNEL_EXEC_POL_SEQ = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::For< + 1, + RAJA::seq_exec, + RAJA::statement:: + For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>>>; + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running RAJA openmp tiled matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running RAJA openmp tiled matrix transpose - parallel top " + "inner loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -196,35 +200,31 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // one of the inner loops. // - using KERNEL_EXEC_POL_OMP = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::seq_exec, - RAJA::statement::For<1, RAJA::omp_parallel_for_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - > - > - >; + using KERNEL_EXEC_POL_OMP = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::For< + 1, + RAJA::omp_parallel_for_exec, + RAJA::statement:: + For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>>>; RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - - Atview(col, row) = Aview(row, col); - - }); + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA openmp tiled matrix transpose - collapsed inner loops...\n"; + std::cout << "\n Running RAJA openmp tiled matrix transpose - collapsed " + "inner loops...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -234,27 +234,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // to/from the tile. 
// - using KERNEL_EXEC_POL_OMP2 = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::seq_exec, - RAJA::statement::Collapse, - RAJA::statement::Lambda<0> - > //closes collapse - > // closes Tile 0 - > // closes Tile 1 - >; // closes policy list + using KERNEL_EXEC_POL_OMP2 = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile<0, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::Collapse< + RAJA::omp_parallel_collapse_exec, + RAJA::ArgList<0, 1>, + RAJA::statement::Lambda<0>> // closes collapse + > // closes Tile 0 + > // closes Tile 1 + >; // closes policy list RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - - Atview(col, row) = Aview(row, col); - - }); + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -267,29 +264,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using KERNEL_EXEC_POL_CUDA = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::cuda_block_x_loop, - RAJA::statement::For<1, RAJA::cuda_thread_y_direct, - RAJA::statement::For<0, RAJA::cuda_thread_x_direct, - RAJA::statement::Lambda<0> - > - > - > - > - > - >; - - RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - - Atview(col, row) = Aview(row, col); - - }); + RAJA::KernelPolicy, + RAJA::cuda_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_loop, + RAJA::statement::For< + 1, + RAJA::cuda_thread_y_direct, + RAJA::statement::For<0, + RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<0>>>>>>>; + + RAJA::kernel(RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE(int col, int row) { + Atview(col, row) = Aview(row, col); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -314,16 +307,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match &= false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -335,11 +334,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise #9: RAJA local array matrix transpose...\n"; @@ -71,8 +71,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = 
memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -85,9 +85,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Construct a permuted layout for At so that the column index has stride 1 // - std::array perm {{1, 0}}; - RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout( {{N_c, N_r}}, - perm ); + std::array perm{{1, 0}}; + RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout({{N_c, N_r}}, perm); RAJA::View> Atview(At, perm_layout); // @@ -102,14 +101,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } // printResult(Aview, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of local array matrix transpose...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -117,8 +118,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int brow = 0; brow < outer_Dimr; ++brow) { - for (int bcol = 0; bcol < outer_Dimc; ++bcol) { + for (int brow = 0; brow < outer_Dimr; ++brow) + { + for (int bcol = 0; bcol < outer_Dimc; ++bcol) + { // Stack-allocated local array for data on a tile int Tile[TILE_SZ][TILE_SZ]; @@ -129,14 +132,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that input matrix data access // is stride-1. // - for (int trow = 0; trow < TILE_SZ; ++trow) { - for (int tcol = 0; tcol < TILE_SZ; ++tcol) { + for (int trow = 0; trow < TILE_SZ; ++trow) + { + for (int tcol = 0; tcol < TILE_SZ; ++tcol) + { - int col = bcol * TILE_SZ + tcol; // Matrix column index - int row = brow * TILE_SZ + trow; // Matrix row index + int col = bcol * TILE_SZ + tcol; // Matrix column index + int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[trow][tcol] = Aview(row, col); } } @@ -148,25 +154,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1. // - for (int tcol = 0; tcol < TILE_SZ; ++tcol) { - for (int trow = 0; trow < TILE_SZ; ++trow) { + for (int tcol = 0; tcol < TILE_SZ; ++tcol) + { + for (int trow = 0; trow < TILE_SZ; ++trow) + { - int col = bcol * TILE_SZ + tcol; // Matrix column index - int row = brow * TILE_SZ + trow; // Matrix row index + int col = bcol * TILE_SZ + tcol; // Matrix column index + int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[trow][tcol]; } } } - } } checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // // The following RAJA variants will use the RAJA::kernel method to @@ -177,7 +185,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // tile_fixed statements. Iterations inside a RAJA loop is given by their // global iteration number. 
// -#if 0 // needed for exercises, but if-def'd out to quiet compiler warnings. +#if 0 // needed for exercises, but if-def'd out to quiet compiler warnings. RAJA::RangeSegment row_Range(0, N_r); RAJA::RangeSegment col_Range(0, N_c); #endif @@ -190,7 +198,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using TILE_MEM = - RAJA::LocalArray, RAJA::SizeList>; + RAJA::LocalArray, RAJA::SizeList>; // **NOTE** The LocalArray is created here, but it's memory is not yet // allocated. This is done when the 'InitLocalMem' statement @@ -199,7 +207,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) TILE_MEM RAJA_Tile; -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - sequential matrix transpose example ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -425,16 +433,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match &= false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -446,8 +460,10 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) << std::endl; } diff --git a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp index 1900bf1157..9603820403 100644 --- a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp +++ b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp @@ -17,10 +17,10 @@ * EXERCISE #9: Matrix Transpose with Local Array * * In this exercise, you will use RAJA constructs to transpose a matrix - * using a loop tiling algorithm similar to exercise 8. However, this + * using a loop tiling algorithm similar to exercise 8. However, this * exercise is different in that you will use a local array to write - * to and read from as each matrix tile is transposed. An input matrix - * A of dimension N_r x N_c is provided. You will fill in the entries + * to and read from as each matrix tile is transposed. An input matrix + * A of dimension N_r x N_c is provided. You will fill in the entries * of the transpose matrix At. * * This file contains a C-style variant of the sequential matrix transpose. 
@@ -57,7 +57,7 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise #9: RAJA local array matrix transpose...\n"; @@ -71,8 +71,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -85,9 +85,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Construct a permuted layout for At so that the column index has stride 1 // - std::array perm {{1, 0}}; - RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout( {{N_c, N_r}}, - perm ); + std::array perm{{1, 0}}; + RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout({{N_c, N_r}}, perm); RAJA::View> Atview(At, perm_layout); // @@ -102,14 +101,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } // printResult(Aview, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of local array matrix transpose...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -117,8 +118,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int brow = 0; brow < outer_Dimr; ++brow) { - for (int bcol = 0; bcol < outer_Dimc; ++bcol) { + for (int brow = 0; brow < outer_Dimr; ++brow) + { + for (int bcol = 0; bcol < outer_Dimc; ++bcol) + { // Stack-allocated local array for data on a tile int Tile[TILE_SZ][TILE_SZ]; @@ -129,14 +132,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that input matrix data access // is stride-1. // - for (int trow = 0; trow < TILE_SZ; ++trow) { - for (int tcol = 0; tcol < TILE_SZ; ++tcol) { + for (int trow = 0; trow < TILE_SZ; ++trow) + { + for (int tcol = 0; tcol < TILE_SZ; ++tcol) + { - int col = bcol * TILE_SZ + tcol; // Matrix column index - int row = brow * TILE_SZ + trow; // Matrix row index + int col = bcol * TILE_SZ + tcol; // Matrix column index + int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[trow][tcol] = Aview(row, col); } } @@ -148,25 +154,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1. 
// - for (int tcol = 0; tcol < TILE_SZ; ++tcol) { - for (int trow = 0; trow < TILE_SZ; ++trow) { + for (int tcol = 0; tcol < TILE_SZ; ++tcol) + { + for (int trow = 0; trow < TILE_SZ; ++trow) + { - int col = bcol * TILE_SZ + tcol; // Matrix column index - int row = brow * TILE_SZ + trow; // Matrix row index + int col = bcol * TILE_SZ + tcol; // Matrix column index + int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[trow][tcol]; } } } - } } checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // // The following RAJA variants will use the RAJA::kernel method to @@ -188,7 +196,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using TILE_MEM = - RAJA::LocalArray, RAJA::SizeList>; + RAJA::LocalArray, RAJA::SizeList>; // **NOTE** The LocalArray is created here, but it's memory is not yet // allocated. This is done when the 'InitLocalMem' statement @@ -197,55 +205,55 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) TILE_MEM RAJA_Tile; -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - sequential matrix transpose example ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - using SEQ_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::seq_exec, - - RAJA::statement::InitLocalMem, - - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, - RAJA::seq_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, - RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - >, - - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, - RAJA::seq_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, - RAJA::seq_exec, - RAJA::statement::Lambda<1> - > - > - - > - > - > - >; + using SEQ_EXEC_POL = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + + RAJA::statement::InitLocalMem< + RAJA::cpu_tile_mem, + RAJA::ParamList<2>, + + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::ForICount<0, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::Lambda<0>>>, + + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::ForICount<1, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::Lambda<1>>> + + >>>>; + + RAJA::kernel_param( + RAJA::make_tuple(col_Range, row_Range), - RAJA::kernel_param( RAJA::make_tuple(col_Range, row_Range), - - RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - - [=](int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { - - RAJA_Tile(trow, tcol) = Aview(row, col); - - }, - - [=](int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { + RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - Atview(col, row) = RAJA_Tile(trow, tcol); + [=](int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { + RAJA_Tile(trow, tcol) = Aview(row, col); + }, - }); + [=](int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { + Atview(col, row) = RAJA_Tile(trow, tcol); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -257,49 +265,48 
@@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); - using OPENMP_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::omp_parallel_for_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::seq_exec, - - RAJA::statement::InitLocalMem, - - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, - RAJA::seq_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, - RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - >, - - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, - RAJA::seq_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, - RAJA::seq_exec, - RAJA::statement::Lambda<1> - > - > - > - > - > - >; - - RAJA::kernel_param( RAJA::make_tuple(col_Range, row_Range), + using OPENMP_EXEC_POL = RAJA::KernelPolicy, + RAJA::omp_parallel_for_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + + RAJA::statement::InitLocalMem< + RAJA::cpu_tile_mem, + RAJA::ParamList<2>, + + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::ForICount<0, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::Lambda<0>>>, + + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::ForICount<1, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::Lambda<1>>>>>>>; + + RAJA::kernel_param( + RAJA::make_tuple(col_Range, row_Range), RAJA::make_tuple((int)0, (int)0, RAJA_Tile), [=](int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { - RAJA_Tile(trow, tcol) = Aview(row, col); - }, [=](int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { - Atview(col, row) = RAJA_Tile(trow, tcol); - }); checkResult(Atview, N_c, N_r); @@ -315,55 +322,55 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using CUDA_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::cuda_block_x_loop, - - RAJA::statement::InitLocalMem, - - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, - RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, - RAJA::cuda_thread_x_direct, - RAJA::statement::Lambda<0> - > - >, - - RAJA::statement::CudaSyncThreads, - - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, - RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, - RAJA::cuda_thread_x_direct, - RAJA::statement::Lambda<1> - > - >, - - RAJA::statement::CudaSyncThreads - > - > - > - > - >; - - - RAJA::kernel_param( RAJA::make_tuple(col_Range, row_Range), + RAJA::KernelPolicy, + RAJA::cuda_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_loop, + + RAJA::statement::InitLocalMem< + RAJA::cuda_shared_mem, + RAJA::ParamList<2>, + + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<1>, + RAJA::cuda_thread_y_direct, + RAJA::statement::ForICount<0, + RAJA::statement::Param<0>, + RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<0>>>, + + RAJA::statement::CudaSyncThreads, + + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<0>, + RAJA::cuda_thread_y_direct, + RAJA::statement::ForICount<1, + RAJA::statement::Param<1>, + RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<1>>>, + + RAJA::statement::CudaSyncThreads>>>>>; + + + RAJA::kernel_param( + 
RAJA::make_tuple(col_Range, row_Range), RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - [=] RAJA_DEVICE (int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { - + [=] RAJA_DEVICE( + int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { RAJA_Tile(trow, tcol) = Aview(row, col); - }, - [=] RAJA_DEVICE(int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { - + [=] RAJA_DEVICE( + int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { Atview(col, row) = RAJA_Tile(trow, tcol); - }); checkResult(Atview, N_c, N_r); @@ -391,16 +398,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match &= false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -412,8 +425,10 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) << std::endl; } diff --git a/exercises/tutorial_halfday/memoryManager.hpp b/exercises/tutorial_halfday/memoryManager.hpp index 83fb8cb3bb..c563033f9c 100644 --- a/exercises/tutorial_halfday/memoryManager.hpp +++ b/exercises/tutorial_halfday/memoryManager.hpp @@ -28,12 +28,12 @@ namespace memoryManager { template -T *allocate(RAJA::Index_type size) +T* allocate(RAJA::Index_type size) { - T *ptr; + T* ptr; #if defined(RAJA_ENABLE_CUDA) cudaErrchk( - cudaMallocManaged((void **)&ptr, sizeof(T) * size, cudaMemAttachGlobal)); + cudaMallocManaged((void**)&ptr, sizeof(T) * size, cudaMemAttachGlobal)); #else ptr = new T[size]; #endif @@ -41,9 +41,10 @@ T *allocate(RAJA::Index_type size) } template -void deallocate(T *&ptr) +void deallocate(T*& ptr) { - if (ptr) { + if (ptr) + { #if defined(RAJA_ENABLE_CUDA) cudaErrchk(cudaFree(ptr)); #else @@ -54,31 +55,32 @@ void deallocate(T *&ptr) } #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) - template - T *allocate_gpu(RAJA::Index_type size) - { - T *ptr; +template +T* allocate_gpu(RAJA::Index_type size) +{ + T* ptr; #if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaMalloc((void **)&ptr, sizeof(T) * size)); + cudaErrchk(cudaMalloc((void**)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipMalloc((void **)&ptr, sizeof(T) * size)); + hipErrchk(hipMalloc((void**)&ptr, sizeof(T) * size)); #endif - return ptr; - } + return ptr; +} - template - void deallocate_gpu(T *&ptr) +template +void deallocate_gpu(T*& ptr) +{ + if (ptr) { - if (ptr) { #if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaFree(ptr)); + cudaErrchk(cudaFree(ptr)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipFree(ptr)); + hipErrchk(hipFree(ptr)); #endif - ptr = nullptr; - } + ptr = nullptr; } +} #endif -}; // namespace memoryManager +}; // namespace memoryManager #endif diff --git a/exercises/vector-addition.cpp b/exercises/vector-addition.cpp index dbe5260f6d..4528d7a8c9 100644 --- a/exercises/vector-addition.cpp +++ b/exercises/vector-addition.cpp @@ -16,7 +16,7 @@ /* * Vector Addition Exercise * - * In this exercise, you will compute c = a + b, where a, b, c are + * In this exercise, you will compute c = a + b, 
where a, b, c are * integer vectors. * * This file contains sequential and OpenMP variants of the vector addition @@ -24,7 +24,7 @@ * plus a RAJA CUDA version if you have access to an NVIDIA GPU and a CUDA * compiler, in empty code sections indicated by comments. * - * The exercise shows you how to use RAJA in its simplest form and + * The exercise shows you how to use RAJA in its simplest form and * illustrates similarities between a C-style for-loop and a RAJA forall loop. * * RAJA features you will use: @@ -32,75 +32,77 @@ * - Index range segment * - Execution policies * - * Note: if CUDA is enabled, CUDA unified memory is used. + * Note: if CUDA is enabled, CUDA unified memory is used. */ /* Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -//constexpr int CUDA_BLOCK_SIZE = 256; +// constexpr int CUDA_BLOCK_SIZE = 256; #endif #if defined(RAJA_ENABLE_HIP) -//constexpr int HIP_BLOCK_SIZE = 256; +// constexpr int HIP_BLOCK_SIZE = 256; #endif #if defined(RAJA_ENABLE_SYCL) -//constexpr int SYCL_BLOCK_SIZE = 256; +// constexpr int SYCL_BLOCK_SIZE = 256; #endif // // Functions for checking and printing arrays // -void checkResult(int* c, int* c_ref, int len); +void checkResult(int* c, int* c_ref, int len); void printArray(int* v, int len); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: RAJA Vector Addition...\n"; -// -// Define vector length -// + // + // Define vector length + // constexpr int N = 1000000; -// -// Allocate and initialize vector data to random numbers in [1, 10]. -// - int *a = memoryManager::allocate(N); - int *b = memoryManager::allocate(N); - int *c = memoryManager::allocate(N); - int *c_ref = memoryManager::allocate(N); + // + // Allocate and initialize vector data to random numbers in [1, 10]. + // + int* a = memoryManager::allocate(N); + int* b = memoryManager::allocate(N); + int* c = memoryManager::allocate(N); + int* c_ref = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = rand() % 10 + 1; b[i] = rand() % 10 + 1; } -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::memset(c_ref, 0, N * sizeof(int)); std::cout << "\n Running C-style sequential vector addition...\n"; // _cstyle_vector_add_start - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { c_ref[i] = a[i] + b[i]; } // _cstyle_vector_add_end -//printArray(c_ref, N); + // printArray(c_ref, N); -//----------------------------------------------------------------------------// -// RAJA::seq_exec policy enforces strictly sequential execution. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::seq_exec policy enforces strictly sequential execution. 
+ //----------------------------------------------------------------------------// std::memset(c, 0, N * sizeof(int)); @@ -110,25 +112,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the vector addition kernel using a RAJA::forall - /// method and RAJA::seq_exec execution policy type. + /// method and RAJA::seq_exec execution policy type. /// /// NOTE: We've done this one for you to help you get started... /// // _rajaseq_vector_add_start - RAJA::forall(RAJA::TypedRangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int i) { c[i] = a[i] + b[i]; }); // _rajaseq_vector_add_end checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); -//----------------------------------------------------------------------------// -// RAJA::simd_exec policy attempts to force the compiler to generate SIMD -// vectorization optimizations. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::simd_exec policy attempts to force the compiler to generate SIMD + // vectorization optimizations. + //----------------------------------------------------------------------------// std::memset(c, 0, N * sizeof(int)); @@ -142,12 +143,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -155,21 +156,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running C-style OpenMP vector addition...\n"; - #pragma omp parallel for - for (int i = 0; i < N; ++i) { +#pragma omp parallel for + for (int i = 0; i < N; ++i) + { c[i] = a[i] + b[i]; } checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); #endif -//----------------------------------------------------------------------------// -// RAJA::omp_parallel_for_exec policy runs the loop in parallel using -// OpenMP multithreading. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::omp_parallel_for_exec policy runs the loop in parallel using + // OpenMP multithreading. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -185,13 +187,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// checkResult(c, c_ref, N); -//printArray(c, N); +// printArray(c, N); #endif -//----------------------------------------------------------------------------// -// RAJA::cuda_exec policy runs the loop as a CUDA kernel on a GPU device. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::cuda_exec policy runs the loop as a CUDA kernel on a GPU device. 
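As a minimal sketch of what the CUDA TODO section below asks for, mirroring the RAJA::forall pattern used in the matching solution file later in this patch (it assumes the d_a/d_b/d_c device arrays allocated in that section and the CUDA_BLOCK_SIZE constant declared near the top of the file):

    // Minimal CUDA variant of the vector add; CUDA_BLOCK_SIZE must be
    // uncommented near the top of the file, as the TODO note says.
    RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
        RAJA::TypedRangeSegment<int>(0, N),
        [=] RAJA_DEVICE (int i) { d_c[i] = d_a[i] + d_b[i]; });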
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -199,12 +201,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA CUDA vector addition...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); + int* d_c = memoryManager::allocate_gpu(N); - cudaErrchk(cudaMemcpy( d_a, a, N * sizeof(int), cudaMemcpyHostToDevice )); - cudaErrchk(cudaMemcpy( d_b, b, N * sizeof(int), cudaMemcpyHostToDevice )); + cudaErrchk(cudaMemcpy(d_a, a, N * sizeof(int), cudaMemcpyHostToDevice)); + cudaErrchk(cudaMemcpy(d_b, b, N * sizeof(int), cudaMemcpyHostToDevice)); /// /// TODO... @@ -213,53 +215,54 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// method and RAJA::cuda_exec execution policy type. /// /// NOTE: You will have to uncomment 'CUDA_BLOCK_SIZE' near the - /// top of the file if you want to use it here. + /// top of the file if you want to use it here. /// - cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); + cudaErrchk(cudaMemcpy(c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost)); checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); -//----------------------------------------------------------------------------// -// RAJA::cuda_exec policy runs the loop as a CUDA kernel asynchronously on a -// GPU device with 2 blocks per SM. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::cuda_exec policy runs the loop as a CUDA kernel asynchronously on a + // GPU device with 2 blocks per SM. + //----------------------------------------------------------------------------// std::memset(c, 0, N * sizeof(int)); - std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n"; + std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector " + "addition...\n"; /// /// TODO... /// /// EXERCISE: Implement the vector addition kernel using a RAJA::forall - /// method and RAJA::cuda_exec execution policy type with + /// method and RAJA::cuda_exec execution policy type with /// arguments defining 2 blocks per SM and asynchronous execution. /// /// NOTE: You will have to uncomment 'CUDA_BLOCK_SIZE' near the - /// top of the file if you want to use it here. + /// top of the file if you want to use it here. /// - cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); + cudaErrchk(cudaMemcpy(c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost)); checkResult(c, c_ref, N); -//printResult(c, N); +// printResult(c, N); #endif -//----------------------------------------------------------------------------// -// RAJA::hip_exec policy runs the loop as a HIP kernel on a GPU device. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::hip_exec policy runs the loop as a HIP kernel on a GPU device. 
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP vector addition...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); + int* d_c = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_b, b, N * sizeof(int), hipMemcpyHostToDevice)); /// /// TODO... @@ -268,29 +271,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// method and RAJA::hip_exec execution policy type. /// /// NOTE: You will have to uncomment 'HIP_BLOCK_SIZE' near the - /// top of the file if you want to use it here. + /// top of the file if you want to use it here. /// - hipErrchk(hipMemcpy( c, d_c, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(c, d_c, N * sizeof(int), hipMemcpyDeviceToHost)); checkResult(c, c_ref, N); -//printResult(c, N); + // printResult(c, N); memoryManager::deallocate_gpu(d_a); memoryManager::deallocate_gpu(d_b); memoryManager::deallocate_gpu(d_c); #endif -//----------------------------------------------------------------------------// -// RAJA::sycl_exec policy runs the loop as a SYCL kernel. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::sycl_exec policy runs the loop as a SYCL kernel. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) std::cout << "\n Running RAJA SYCL vector addition...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); + int* d_c = memoryManager::allocate_gpu(N); memoryManager::sycl_res->memcpy(d_a, a, N * sizeof(int)); memoryManager::sycl_res->memcpy(d_b, b, N * sizeof(int)); @@ -302,24 +305,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// method and RAJA::hip_exec execution policy type. /// /// NOTE: You will have to uncomment 'SYCL_BLOCK_SIZE' near the - /// top of the file if you want to use it here. + /// top of the file if you want to use it here. /// memoryManager::sycl_res->memcpy(c, d_c, N * sizeof(int)); checkResult(c, c_ref, N); -//printResult(c, N); + // printResult(c, N); memoryManager::deallocate_gpu(d_a); memoryManager::deallocate_gpu(d_b); memoryManager::deallocate_gpu(d_c); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(a); memoryManager::deallocate(b); memoryManager::deallocate(c); @@ -336,12 +339,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) void checkResult(int* c, int* c_ref, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( correct && c[i] != c_ref[i] ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (correct && c[i] != c_ref[i]) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -352,9 +362,9 @@ void checkResult(int* c, int* c_ref, int len) void printArray(int* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "v[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; } - diff --git a/exercises/vector-addition_solution.cpp b/exercises/vector-addition_solution.cpp index 3bbc070731..5149d23d56 100644 --- a/exercises/vector-addition_solution.cpp +++ b/exercises/vector-addition_solution.cpp @@ -16,7 +16,7 @@ /* * Vector Addition Exercise * - * In this exercise, you will compute c = a + b, where a, b, c are + * In this exercise, you will compute c = a + b, where a, b, c are * integer vectors. * * This file contains sequential and OpenMP variants of the vector addition @@ -24,7 +24,7 @@ * plus a RAJA CUDA version if you have access to an NVIDIA GPU and a CUDA * compiler, in empty code sections indicated by comments. * - * The exercise shows you how to use RAJA in its simplest form and + * The exercise shows you how to use RAJA in its simplest form and * illustrates similarities between a C-style for-loop and a RAJA forall loop. * * RAJA features you will use: @@ -32,7 +32,7 @@ * - Index range segment * - Execution policies * - * Note: if CUDA is enabled, CUDA unified memory is used. + * Note: if CUDA is enabled, CUDA unified memory is used. */ /* @@ -53,93 +53,89 @@ constexpr int SYCL_BLOCK_SIZE = 256; // // Functions for checking and printing arrays // -void checkResult(int* c, int* c_ref, int len); +void checkResult(int* c, int* c_ref, int len); void printArray(int* v, int len); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: RAJA Vector Addition...\n"; -// -// Define vector length -// + // + // Define vector length + // constexpr int N = 1000000; -// -// Allocate and initialize vector data to random numbers in [1, 10]. -// - int *a = memoryManager::allocate(N); - int *b = memoryManager::allocate(N); - int *c = memoryManager::allocate(N); - int *c_ref = memoryManager::allocate(N); + // + // Allocate and initialize vector data to random numbers in [1, 10]. + // + int* a = memoryManager::allocate(N); + int* b = memoryManager::allocate(N); + int* c = memoryManager::allocate(N); + int* c_ref = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = rand() % 10 + 1; b[i] = rand() % 10 + 1; } -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. 
+ //----------------------------------------------------------------------------// std::memset(c_ref, 0, N * sizeof(int)); std::cout << "\n Running C-style sequential vector addition...\n"; // _cstyle_vector_add_start - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { c_ref[i] = a[i] + b[i]; } // _cstyle_vector_add_end -//printArray(c_ref, N); + // printArray(c_ref, N); -//----------------------------------------------------------------------------// -// RAJA::seq_exec policy enforces strictly sequential execution. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::seq_exec policy enforces strictly sequential execution. + //----------------------------------------------------------------------------// std::memset(c, 0, N * sizeof(int)); std::cout << "\n Running RAJA sequential vector addition...\n"; // _rajaseq_vector_add_start - RAJA::forall< RAJA::seq_exec >( - RAJA::TypedRangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - } - ); + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int i) { c[i] = a[i] + b[i]; }); // _rajaseq_vector_add_end checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); -//----------------------------------------------------------------------------// -// RAJA::simd_exec policy attempts to force the compiler to generate SIMD -// vectorization optimizations. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::simd_exec policy attempts to force the compiler to generate SIMD + // vectorization optimizations. + //----------------------------------------------------------------------------// std::memset(c, 0, N * sizeof(int)); std::cout << "\n Running RAJA SIMD vector addition...\n"; - RAJA::forall( - RAJA::TypedRangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - } - ); + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int i) { c[i] = a[i] + b[i]; }); checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -147,21 +143,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running C-style OpenMP vector addition...\n"; - #pragma omp parallel for - for (int i = 0; i < N; ++i) { +#pragma omp parallel for + for (int i = 0; i < N; ++i) + { c[i] = a[i] + b[i]; } checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); #endif -//----------------------------------------------------------------------------// -// RAJA::omp_parallel_for_exec policy runs the loop in parallel using -// OpenMP multithreading. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::omp_parallel_for_exec policy runs the loop in parallel using + // OpenMP multithreading. 
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -170,21 +167,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA OpenMP multithreaded vector addition...\n"; // _rajaomp_vector_add_start - RAJA::forall< RAJA::omp_parallel_for_exec >( - RAJA::TypedRangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - } - ); + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int i) { c[i] = a[i] + b[i]; }); // _rajaomp_vector_add_end checkResult(c, c_ref, N); -//printArray(c, N); +// printArray(c, N); #endif -//----------------------------------------------------------------------------// -// RAJA::cuda_exec policy runs the loop as a CUDA kernel on a GPU device. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::cuda_exec policy runs the loop as a CUDA kernel on a GPU device. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -192,116 +186,113 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA CUDA vector addition...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); + int* d_c = memoryManager::allocate_gpu(N); - cudaErrchk(cudaMemcpy( d_a, a, N * sizeof(int), cudaMemcpyHostToDevice )); - cudaErrchk(cudaMemcpy( d_b, b, N * sizeof(int), cudaMemcpyHostToDevice )); + cudaErrchk(cudaMemcpy(d_a, a, N * sizeof(int), cudaMemcpyHostToDevice)); + cudaErrchk(cudaMemcpy(d_b, b, N * sizeof(int), cudaMemcpyHostToDevice)); // _rajacuda_vector_add_start - RAJA::forall< RAJA::cuda_exec >(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - d_c[i] = d_a[i] + d_b[i]; - }); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int i) { d_c[i] = d_a[i] + d_b[i]; }); // _rajacuda_vector_add_end - cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); + cudaErrchk(cudaMemcpy(c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost)); checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); -//----------------------------------------------------------------------------// -// RAJA::cuda_exec policy runs the loop as a CUDA kernel asynchronously on a -// GPU device with 2 blocks per SM. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::cuda_exec policy runs the loop as a CUDA kernel asynchronously on a + // GPU device with 2 blocks per SM. 
+ //----------------------------------------------------------------------------// std::memset(c, 0, N * sizeof(int)); - std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n"; + std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector " + "addition...\n"; // _rajacuda_explicit_vector_add_start const bool Asynchronous = true; - RAJA::forall>(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - d_c[i] = d_a[i] + d_b[i]; - }); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int i) { d_c[i] = d_a[i] + d_b[i]; }); // _rajacuda_explicit_vector_add_end - cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); + cudaErrchk(cudaMemcpy(c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost)); checkResult(c, c_ref, N); -//printResult(c, N); +// printResult(c, N); #endif -//----------------------------------------------------------------------------// -// RAJA::hip_exec policy runs the loop as a HIP kernel on a GPU device. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::hip_exec policy runs the loop as a HIP kernel on a GPU device. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP vector addition...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); + int* d_c = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_b, b, N * sizeof(int), hipMemcpyHostToDevice)); // _rajahip_vector_add_start - RAJA::forall>(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - d_c[i] = d_a[i] + d_b[i]; - }); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int i) { d_c[i] = d_a[i] + d_b[i]; }); // _rajahip_vector_add_end - hipErrchk(hipMemcpy( c, d_c, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(c, d_c, N * sizeof(int), hipMemcpyDeviceToHost)); checkResult(c, c_ref, N); -//printResult(c, N); + // printResult(c, N); memoryManager::deallocate_gpu(d_a); memoryManager::deallocate_gpu(d_b); memoryManager::deallocate_gpu(d_c); #endif -//----------------------------------------------------------------------------// -// RAJA::sycl_exec policy runs the loop as a SYCL kernel. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::sycl_exec policy runs the loop as a SYCL kernel. 
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) std::cout << "\n Running RAJA SYCL vector addition...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); + int* d_c = memoryManager::allocate_gpu(N); memoryManager::sycl_res->memcpy(d_a, a, N * sizeof(int)); memoryManager::sycl_res->memcpy(d_b, b, N * sizeof(int)); // _rajasycl_vector_add_start - RAJA::forall>(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - d_c[i] = d_a[i] + d_b[i]; - }); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int i) { d_c[i] = d_a[i] + d_b[i]; }); // _rajasycl_vector_add_end memoryManager::sycl_res->memcpy(c, d_c, N * sizeof(int)); checkResult(c, c_ref, N); -//printResult(c, N); + // printResult(c, N); memoryManager::deallocate_gpu(d_a); memoryManager::deallocate_gpu(d_b); memoryManager::deallocate_gpu(d_c); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(a); memoryManager::deallocate(b); memoryManager::deallocate(c); @@ -318,12 +309,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) void checkResult(int* c, int* c_ref, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( correct && c[i] != c_ref[i] ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (correct && c[i] != c_ref[i]) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -334,9 +332,9 @@ void checkResult(int* c, int* c_ref, int len) void printArray(int* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "v[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; } - diff --git a/exercises/vertexsum-indexset.cpp b/exercises/vertexsum-indexset.cpp index 258250a741..028293a7f9 100644 --- a/exercises/vertexsum-indexset.cpp +++ b/exercises/vertexsum-indexset.cpp @@ -20,7 +20,7 @@ /* * Mesh vertex area exercise * - * In this exercise, you will use a RAJA TypedIndexSet containing 4 + * In this exercise, you will use a RAJA TypedIndexSet containing 4 * TypedListSegments to parallelize the mesh vertex area computation. * A sum is computed at each vertex on a logically-Cartesian 2D mesh * where the sum represents the vertex "area" as an average of the 4 @@ -32,13 +32,13 @@ * each subset. When the ListSegments are put into an IndexSet, the entire * computation can be executed with one RAJA::forall() statement, where * you iterate over the segments sequentially and execute each segment in - * parallel. This exercise illustrates how RAJA can be used to enable one + * parallel. This exercise illustrates how RAJA can be used to enable one * to get some parallelism from such operations without fundamentally * changing the way the algorithm looks in source code. * * This file contains sequential and OpenMP variants of the vertex area - * computation using C-style for-loops. You will fill in RAJA versions of - * these variants, plus a RAJA CUDA version if you have access to an NVIDIA + * computation using C-style for-loops. 
You will fill in RAJA versions of + * these variants, plus a RAJA CUDA version if you have access to an NVIDIA * GPU and a CUDA compiler, in empty code sections indicated by comments. * * RAJA features you will use: @@ -68,189 +68,204 @@ void checkResult(double* a, double* aref, int n); void printMeshData(double* v, int n, int joff); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: Mesh vertex area with 'colored' IndexSet...\n"; -// _vertexsum_define_start -// -// 2D mesh has N^2 elements (N+1)^2 vertices. -// + // _vertexsum_define_start + // + // 2D mesh has N^2 elements (N+1)^2 vertices. + // constexpr int N = 1000; constexpr int Nelem = N; constexpr int Nelem_tot = Nelem * Nelem; constexpr int Nvert = N + 1; constexpr int Nvert_tot = Nvert * Nvert; -// _vertexsum_define_end + // _vertexsum_define_end double* areae = memoryManager::allocate(Nelem_tot); double* areav = memoryManager::allocate(Nvert_tot); double* areav_ref = memoryManager::allocate(Nvert_tot); - int* e2v_map = memoryManager::allocate(4*Nelem_tot); + int* e2v_map = memoryManager::allocate(4 * Nelem_tot); -// _vertexsum_elemarea_start -// -// Define mesh spacing factor 'h' and set up elem to vertex mapping array. -// + // _vertexsum_elemarea_start + // + // Define mesh spacing factor 'h' and set up elem to vertex mapping array. + // constexpr double h = 0.1; - for (int ie = 0; ie < Nelem_tot; ++ie) { + for (int ie = 0; ie < Nelem_tot; ++ie) + { int j = ie / Nelem; - int imap = 4 * ie ; + int imap = 4 * ie; e2v_map[imap] = ie + j; - e2v_map[imap+1] = ie + j + 1; - e2v_map[imap+2] = ie + j + Nvert; - e2v_map[imap+3] = ie + j + 1 + Nvert; + e2v_map[imap + 1] = ie + j + 1; + e2v_map[imap + 2] = ie + j + Nvert; + e2v_map[imap + 3] = ie + j + 1 + Nvert; } -// -// Initialize element areas so each element area -// depends on the i,j coordinates of the element. -// + // + // Initialize element areas so each element area + // depends on the i,j coordinates of the element. + // std::memset(areae, 0, Nelem_tot * sizeof(double)); - for (int ie = 0; ie < Nelem_tot; ++ie) { + for (int ie = 0; ie < Nelem_tot; ++ie) + { int i = ie % Nelem; int j = ie / Nelem; - areae[ie] = h*(i+1) * h*(j+1); + areae[ie] = h * (i + 1) * h * (j + 1); } -// _vertexsum_elemarea_end + // _vertexsum_elemarea_end -//std::cout << "\n Element areas...\n"; -//printMeshData(areae, Nelem, Nelem); + // std::cout << "\n Element areas...\n"; + // printMeshData(areae, Nelem, Nelem); -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. 
+ //----------------------------------------------------------------------------// std::cout << "\n Running sequential C-style version of vertex sum...\n"; -// _cstyle_vertexarea_seq_start + // _cstyle_vertexarea_seq_start std::memset(areav_ref, 0, Nvert_tot * sizeof(double)); - for (int ie = 0; ie < Nelem_tot; ++ie) { - int* iv = &(e2v_map[4*ie]); - areav_ref[ iv[0] ] += areae[ie] / 4.0 ; - areav_ref[ iv[1] ] += areae[ie] / 4.0 ; - areav_ref[ iv[2] ] += areae[ie] / 4.0 ; - areav_ref[ iv[3] ] += areae[ie] / 4.0 ; + for (int ie = 0; ie < Nelem_tot; ++ie) + { + int* iv = &(e2v_map[4 * ie]); + areav_ref[iv[0]] += areae[ie] / 4.0; + areav_ref[iv[1]] += areae[ie] / 4.0; + areav_ref[iv[2]] += areae[ie] / 4.0; + areav_ref[iv[3]] += areae[ie] / 4.0; } -// _cstyle_vertexarea_seq_end - -//std::cout << "\n Vertex areas (reference)...\n"; -//printMeshData(areav_ref, Nvert, jvoff); - - -//----------------------------------------------------------------------------// -// -// In the following, we partition the element iteration space into four -// subsets (or "colors") indicated by numbers in the figure below. -// -// ----------------- -// | 2 | 3 | 2 | 3 | -// ----------------- -// | 0 | 1 | 0 | 1 | -// ----------------- -// | 2 | 3 | 2 | 3 | -// ----------------- -// | 0 | 1 | 0 | 1 | -// ----------------- -// -// Since none of the elements with the same number share a common vertex, -// we can iterate over each subset ("color") in parallel. -// -// We use RAJA ListSegments and a RAJA IndexSet to define the element -// partitioning. -// - -// _vertexarea_color_start -// -// Gather the element indices for each color in a vector. -// - std::vector< std::vector > idx(4); - - for (int ie = 0; ie < Nelem_tot; ++ie) { + // _cstyle_vertexarea_seq_end + + // std::cout << "\n Vertex areas (reference)...\n"; + // printMeshData(areav_ref, Nvert, jvoff); + + + //----------------------------------------------------------------------------// + // + // In the following, we partition the element iteration space into four + // subsets (or "colors") indicated by numbers in the figure below. + // + // ----------------- + // | 2 | 3 | 2 | 3 | + // ----------------- + // | 0 | 1 | 0 | 1 | + // ----------------- + // | 2 | 3 | 2 | 3 | + // ----------------- + // | 0 | 1 | 0 | 1 | + // ----------------- + // + // Since none of the elements with the same number share a common vertex, + // we can iterate over each subset ("color") in parallel. + // + // We use RAJA ListSegments and a RAJA IndexSet to define the element + // partitioning. + // + + // _vertexarea_color_start + // + // Gather the element indices for each color in a vector. + // + std::vector> idx(4); + + for (int ie = 0; ie < Nelem_tot; ++ie) + { int i = ie % Nelem; int j = ie / Nelem; - if ( i % 2 == 0 ) { - if ( j % 2 == 0 ) { + if (i % 2 == 0) + { + if (j % 2 == 0) + { idx[0].push_back(ie); - } else { + } + else + { idx[2].push_back(ie); } - } else { - if ( j % 2 == 0 ) { + } + else + { + if (j % 2 == 0) + { idx[1].push_back(ie); - } else { + } + else + { idx[3].push_back(ie); } } } -// _vertexarea_color_end + // _vertexarea_color_end -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. Note that we use the vectors -// defined above in this variant to run each element subset in parallel. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. Note that we use the vectors + // defined above in this variant to run each element subset in parallel. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running C-style OpenMP vertex sum...\n"; -// _cstyle_vertexarea_omp_start + // _cstyle_vertexarea_omp_start std::memset(areav, 0, Nvert_tot * sizeof(double)); - for (int icol = 0; icol < 4; ++icol) { - const std::vector& ievec = idx[icol]; - const int len = static_cast(ievec.size()); - - #pragma omp parallel for - for (int i = 0; i < len; ++i) { - int ie = ievec[i]; - int* iv = &(e2v_map[4*ie]); - areav[ iv[0] ] += areae[ie] / 4.0 ; - areav[ iv[1] ] += areae[ie] / 4.0 ; - areav[ iv[2] ] += areae[ie] / 4.0 ; - areav[ iv[3] ] += areae[ie] / 4.0 ; - } - + for (int icol = 0; icol < 4; ++icol) + { + const std::vector& ievec = idx[icol]; + const int len = static_cast(ievec.size()); + +#pragma omp parallel for + for (int i = 0; i < len; ++i) + { + int ie = ievec[i]; + int* iv = &(e2v_map[4 * ie]); + areav[iv[0]] += areae[ie] / 4.0; + areav[iv[1]] += areae[ie] / 4.0; + areav[iv[2]] += areae[ie] / 4.0; + areav[iv[3]] += areae[ie] / 4.0; + } } -// _cstyle_vertexarea_omp_end + // _cstyle_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex areas (reference)...\n"; -//printMeshData(areav_ref, Nvert, jvoff); + // std::cout << "\n Vertex areas (reference)...\n"; + // printMeshData(areav_ref, Nvert, jvoff); #endif // The IndexSet is a variadic template, where the template arguments -// are the segment types that the IndexSet can hold. -// -#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) -// _vertexarea_listsegtype_start +// are the segment types that the IndexSet can hold. +// +#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) || \ + defined(RAJA_ENABLE_HIP) + // _vertexarea_listsegtype_start using SegmentType = RAJA::TypedListSegment; // _vertexarea_listsegtype_end #endif #if defined(RAJA_ENABLE_OPENMP) -// -// Resource object used to construct list segment objects with indices -// living in host (CPU) memory. -// + // + // Resource object used to construct list segment objects with indices + // living in host (CPU) memory. + // camp::resources::Resource host_res{camp::resources::Host()}; -// -// Create a RAJA IndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. + // + // Create a RAJA IndexSet with four ListSegments, one for the indices of + // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA + // variants of the vertex sum calculation. RAJA::TypedIndexSet colorset; - colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), host_res) ); + colorset.push_back(SegmentType(&idx[0][0], idx[0].size(), host_res)); /// /// TODO... @@ -260,56 +275,56 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// below to check if it's correct. 
/// -//----------------------------------------------------------------------------// -// RAJA OpenMP vertex sum calculation using IndexSet (sequential iteration -// over segments, OpenMP parallel iteration of each segment) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA OpenMP vertex sum calculation using IndexSet (sequential iteration + // over segments, OpenMP parallel iteration of each segment) + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP index set vertex sum...\n"; - std::memset(areav, 0, Nvert*Nvert * sizeof(double)); + std::memset(areav, 0, Nvert * Nvert * sizeof(double)); -// _raja_vertexarea_omp_start - using EXEC_POL1 = RAJA::ExecPolicy; + // _raja_vertexarea_omp_start + using EXEC_POL1 = + RAJA::ExecPolicy; RAJA::forall(colorset, [=](int ie) { - int* iv = &(e2v_map[4*ie]); - areav[ iv[0] ] += areae[ie] / 4.0 ; - areav[ iv[1] ] += areae[ie] / 4.0 ; - areav[ iv[2] ] += areae[ie] / 4.0 ; - areav[ iv[3] ] += areae[ie] / 4.0 ; + int* iv = &(e2v_map[4 * ie]); + areav[iv[0]] += areae[ie] / 4.0; + areav[iv[1]] += areae[ie] / 4.0; + areav[iv[2]] += areae[ie] / 4.0; + areav[iv[3]] += areae[ie] / 4.0; }); -// _raja_vertexarea_omp_end + // _raja_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(areav, Nvert, Nvert); + // std::cout << "\n Vertex volumes...\n"; + // printMeshData(areav, Nvert, Nvert); #endif -//----------------------------------------------------------------------------// -// RAJA CUDA vertex sum calculation using IndexSet (sequential iteration -// over segments, CUDA kernel launched for each segment) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA CUDA vertex sum calculation using IndexSet (sequential iteration + // over segments, CUDA kernel launched for each segment) + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -// -// Resource object used to construct list segment objects with indices -// living in device (GPU) memory. -// + // + // Resource object used to construct list segment objects with indices + // living in device (GPU) memory. + // camp::resources::Resource cuda_res{camp::resources::Cuda()}; -// -// Create a RAJA IndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. + // + // Create a RAJA IndexSet with four ListSegments, one for the indices of + // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA + // variants of the vertex sum calculation. RAJA::TypedIndexSet cuda_colorset; - cuda_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), cuda_res) ); + cuda_colorset.push_back(SegmentType(&idx[0][0], idx[0].size(), cuda_res)); /// /// TODO... 
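For reference while reading this hunk: the TODO left in the exercise file is resolved in exercises/vertexsum-indexset_solution.cpp later in this patch by pushing the remaining three ListSegments onto the IndexSet and iterating it with a segment-sequential policy that launches a CUDA kernel per segment. A minimal sketch of that pattern, not part of the patch itself, assuming the SegmentType alias, cuda_res resource, and idx vectors shown above, plus a CUDA_BLOCK_SIZE constant defined earlier in the file:

  // Sketch: complete the colored IndexSet (one ListSegment per color).
  cuda_colorset.push_back(SegmentType(&idx[1][0], idx[1].size(), cuda_res));
  cuda_colorset.push_back(SegmentType(&idx[2][0], idx[2].size(), cuda_res));
  cuda_colorset.push_back(SegmentType(&idx[3][0], idx[3].size(), cuda_res));

  // Iterate segments sequentially on the host; run each segment as a CUDA
  // kernel, so no two concurrent updates touch the same vertex.
  using EXEC_POL2 =
      RAJA::ExecPolicy<RAJA::seq_segit, RAJA::cuda_exec<CUDA_BLOCK_SIZE>>;

  RAJA::forall<EXEC_POL2>(cuda_colorset, [=] RAJA_DEVICE(int ie) {
    int* iv = &(e2v_map[4 * ie]);
    areav[iv[0]] += areae[ie] / 4.0;
    areav[iv[1]] += areae[ie] / 4.0;
    areav[iv[2]] += areae[ie] / 4.0;
    areav[iv[3]] += areae[ie] / 4.0;
  });

Because elements of the same color share no vertex, the per-segment kernels need no atomics; the coloring supplies the correctness that a flat parallel loop over all elements would lack.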
@@ -321,84 +336,85 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA CUDA index set vertex sum...\n"; - std::memset(areav, 0, Nvert*Nvert * sizeof(double)); + std::memset(areav, 0, Nvert * Nvert * sizeof(double)); -// _raja_vertexarea_cuda_start - using EXEC_POL2 = RAJA::ExecPolicy>; + // _raja_vertexarea_cuda_start + using EXEC_POL2 = + RAJA::ExecPolicy>; - RAJA::forall(cuda_colorset, [=] RAJA_DEVICE (int ie) { - int* iv = &(e2v_map[4*ie]); - areav[ iv[0] ] += areae[ie] / 4.0 ; - areav[ iv[1] ] += areae[ie] / 4.0 ; - areav[ iv[2] ] += areae[ie] / 4.0 ; - areav[ iv[3] ] += areae[ie] / 4.0 ; + RAJA::forall(cuda_colorset, [=] RAJA_DEVICE(int ie) { + int* iv = &(e2v_map[4 * ie]); + areav[iv[0]] += areae[ie] / 4.0; + areav[iv[1]] += areae[ie] / 4.0; + areav[iv[2]] += areae[ie] / 4.0; + areav[iv[3]] += areae[ie] / 4.0; }); -// _raja_vertexarea_cuda_end + // _raja_vertexarea_cuda_end checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(areav, Nvert, jvoff); + // std::cout << "\n Vertex volumes...\n"; + // printMeshData(areav, Nvert, jvoff); #endif -//----------------------------------------------------------------------------// -// RAJA HIP vertex sum calculation using IndexSet (sequential iteration -// over segments, HIP kernel launched for each segment) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA HIP vertex sum calculation using IndexSet (sequential iteration + // over segments, HIP kernel launched for each segment) + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -// -// Allocate and initialize device memory arrays -// + // + // Allocate and initialize device memory arrays + // double* d_areae = memoryManager::allocate_gpu(Nelem_tot); double* d_areav = memoryManager::allocate_gpu(Nvert_tot); - int* d_e2v_map = memoryManager::allocate_gpu(4*Nelem_tot); + int* d_e2v_map = memoryManager::allocate_gpu(4 * Nelem_tot); - hipMemcpy(d_areae, areae, Nelem_tot*sizeof(double), hipMemcpyHostToDevice); - hipMemcpy(d_e2v_map, e2v_map, 4*Nelem_tot*sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(d_areae, areae, Nelem_tot * sizeof(double), hipMemcpyHostToDevice); + hipMemcpy( + d_e2v_map, e2v_map, 4 * Nelem_tot * sizeof(int), hipMemcpyHostToDevice); std::memset(areav, 0, Nvert_tot * sizeof(double)); - hipMemcpy(d_areav, areav, Nvert_tot*sizeof(double), hipMemcpyHostToDevice); + hipMemcpy(d_areav, areav, Nvert_tot * sizeof(double), hipMemcpyHostToDevice); -// -// Resource object used to construct list segment objects with indices -// living in device (GPU) memory. -// + // + // Resource object used to construct list segment objects with indices + // living in device (GPU) memory. + // camp::resources::Resource hip_res{camp::resources::Hip()}; -// -// Create a RAJA IndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. + // + // Create a RAJA IndexSet with four ListSegments, one for the indices of + // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA + // variants of the vertex sum calculation. 
RAJA::TypedIndexSet hip_colorset; - hip_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), hip_res) ); - hip_colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), hip_res) ); - hip_colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), hip_res) ); - hip_colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), hip_res) ); + hip_colorset.push_back(SegmentType(&idx[0][0], idx[0].size(), hip_res)); + hip_colorset.push_back(SegmentType(&idx[1][0], idx[1].size(), hip_res)); + hip_colorset.push_back(SegmentType(&idx[2][0], idx[2].size(), hip_res)); + hip_colorset.push_back(SegmentType(&idx[3][0], idx[3].size(), hip_res)); std::cout << "\n Running RAJA HIP index set vertex sum...\n"; -// _raja_vertexarea_hip_start - using EXEC_POL3 = RAJA::ExecPolicy>; + // _raja_vertexarea_hip_start + using EXEC_POL3 = + RAJA::ExecPolicy>; - RAJA::forall(hip_colorset, [=] RAJA_DEVICE (int ie) { - int* iv = &(d_e2v_map[4*ie]); - d_areav[ iv[0] ] += d_areae[ie] / 4.0 ; - d_areav[ iv[1] ] += d_areae[ie] / 4.0 ; - d_areav[ iv[2] ] += d_areae[ie] / 4.0 ; - d_areav[ iv[3] ] += d_areae[ie] / 4.0 ; + RAJA::forall(hip_colorset, [=] RAJA_DEVICE(int ie) { + int* iv = &(d_e2v_map[4 * ie]); + d_areav[iv[0]] += d_areae[ie] / 4.0; + d_areav[iv[1]] += d_areae[ie] / 4.0; + d_areav[iv[2]] += d_areae[ie] / 4.0; + d_areav[iv[3]] += d_areae[ie] / 4.0; }); -// _raja_vertexarea_hip_end + // _raja_vertexarea_hip_end - hipMemcpy(areav, d_areav, Nvert_tot*sizeof(double), hipMemcpyDeviceToHost); + hipMemcpy(areav, d_areav, Nvert_tot * sizeof(double), hipMemcpyDeviceToHost); checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(areav, Nvert, jvoff); + // std::cout << "\n Vertex volumes...\n"; + // printMeshData(areav, Nvert, jvoff); memoryManager::deallocate_gpu(d_areae); memoryManager::deallocate_gpu(d_areav); @@ -406,7 +422,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // Clean up... 
memoryManager::deallocate(areae); @@ -425,12 +441,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) void checkResult(double* a, double* aref, int n) { bool correct = true; - for (int i = 0; i < n*n; i++) { - if ( correct && std::abs(a[i] - aref[i]) > 10e-12 ) { correct = false; } + for (int i = 0; i < n * n; i++) + { + if (correct && std::abs(a[i] - aref[i]) > 10e-12) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -441,11 +464,12 @@ void checkResult(double* a, double* aref, int n) void printMeshData(double* v, int n, int joff) { std::cout << std::endl; - for (int j = 0 ; j < n ; ++j) { - for (int i = 0 ; i < n ; ++i) { - int ii = i + j*joff ; - std::cout << "v(" << i << "," << j << ") = " - << v[ii] << std::endl; + for (int j = 0; j < n; ++j) + { + for (int i = 0; i < n; ++i) + { + int ii = i + j * joff; + std::cout << "v(" << i << "," << j << ") = " << v[ii] << std::endl; } } std::cout << std::endl; diff --git a/exercises/vertexsum-indexset_solution.cpp b/exercises/vertexsum-indexset_solution.cpp index 5c1617343a..2861109eda 100644 --- a/exercises/vertexsum-indexset_solution.cpp +++ b/exercises/vertexsum-indexset_solution.cpp @@ -20,7 +20,7 @@ /* * Mesh vertex area exercise * - * In this exercise, you will use a RAJA TypedIndexSet containing 4 + * In this exercise, you will use a RAJA TypedIndexSet containing 4 * TypedListSegments to parallelize the mesh vertex area computation. * A sum is computed at each vertex on a logically-Cartesian 2D mesh * where the sum represents the vertex "area" as an average of the 4 @@ -32,13 +32,13 @@ * each subset. When the ListSegments are put into an IndexSet, the entire * computation can be executed with one RAJA::forall() statement, where * you iterate over the segments sequentially and execute each segment in - * parallel. This exercise illustrates how RAJA can be used to enable one + * parallel. This exercise illustrates how RAJA can be used to enable one * to get some parallelism from such operations without fundamentally * changing the way the algorithm looks in source code. * * This file contains sequential and OpenMP variants of the vertex area - * computation using C-style for-loops. You will fill in RAJA versions of - * these variants, plus a RAJA CUDA version if you have access to an NVIDIA + * computation using C-style for-loops. You will fill in RAJA versions of + * these variants, plus a RAJA CUDA version if you have access to an NVIDIA * GPU and a CUDA compiler, in empty code sections indicated by comments. * * RAJA features you will use: @@ -68,329 +68,345 @@ void checkResult(double* a, double* aref, int n); void printMeshData(double* v, int n, int joff); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: Mesh vertex area with 'colored' IndexSet...\n"; -// _vertexsum_define_start -// -// 2D mesh has N^2 elements (N+1)^2 vertices. -// + // _vertexsum_define_start + // + // 2D mesh has N^2 elements (N+1)^2 vertices. 
+ // constexpr int N = 1000; constexpr int Nelem = N; constexpr int Nelem_tot = Nelem * Nelem; constexpr int Nvert = N + 1; constexpr int Nvert_tot = Nvert * Nvert; -// _vertexsum_define_end + // _vertexsum_define_end double* areae = memoryManager::allocate(Nelem_tot); double* areav = memoryManager::allocate(Nvert_tot); double* areav_ref = memoryManager::allocate(Nvert_tot); - int* e2v_map = memoryManager::allocate(4*Nelem_tot); + int* e2v_map = memoryManager::allocate(4 * Nelem_tot); -// _vertexsum_elemarea_start -// -// Define mesh spacing factor 'h' and set up elem to vertex mapping array. -// + // _vertexsum_elemarea_start + // + // Define mesh spacing factor 'h' and set up elem to vertex mapping array. + // constexpr double h = 0.1; - for (int ie = 0; ie < Nelem_tot; ++ie) { + for (int ie = 0; ie < Nelem_tot; ++ie) + { int j = ie / Nelem; - int imap = 4 * ie ; + int imap = 4 * ie; e2v_map[imap] = ie + j; - e2v_map[imap+1] = ie + j + 1; - e2v_map[imap+2] = ie + j + Nvert; - e2v_map[imap+3] = ie + j + 1 + Nvert; + e2v_map[imap + 1] = ie + j + 1; + e2v_map[imap + 2] = ie + j + Nvert; + e2v_map[imap + 3] = ie + j + 1 + Nvert; } -// -// Initialize element areas so each element area -// depends on the i,j coordinates of the element. -// + // + // Initialize element areas so each element area + // depends on the i,j coordinates of the element. + // std::memset(areae, 0, Nelem_tot * sizeof(double)); - for (int ie = 0; ie < Nelem_tot; ++ie) { + for (int ie = 0; ie < Nelem_tot; ++ie) + { int i = ie % Nelem; int j = ie / Nelem; - areae[ie] = h*(i+1) * h*(j+1); + areae[ie] = h * (i + 1) * h * (j + 1); } -// _vertexsum_elemarea_end + // _vertexsum_elemarea_end -//std::cout << "\n Element areas...\n"; -//printMeshData(areae, Nelem, Nelem); + // std::cout << "\n Element areas...\n"; + // printMeshData(areae, Nelem, Nelem); -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n Running sequential C-style version of vertex sum...\n"; -// _cstyle_vertexarea_seq_start + // _cstyle_vertexarea_seq_start std::memset(areav_ref, 0, Nvert_tot * sizeof(double)); - for (int ie = 0; ie < Nelem_tot; ++ie) { - int* iv = &(e2v_map[4*ie]); - areav_ref[ iv[0] ] += areae[ie] / 4.0 ; - areav_ref[ iv[1] ] += areae[ie] / 4.0 ; - areav_ref[ iv[2] ] += areae[ie] / 4.0 ; - areav_ref[ iv[3] ] += areae[ie] / 4.0 ; + for (int ie = 0; ie < Nelem_tot; ++ie) + { + int* iv = &(e2v_map[4 * ie]); + areav_ref[iv[0]] += areae[ie] / 4.0; + areav_ref[iv[1]] += areae[ie] / 4.0; + areav_ref[iv[2]] += areae[ie] / 4.0; + areav_ref[iv[3]] += areae[ie] / 4.0; } -// _cstyle_vertexarea_seq_end - -//std::cout << "\n Vertex areas (reference)...\n"; -//printMeshData(areav_ref, Nvert, jvoff); - - -//----------------------------------------------------------------------------// -// -// In the following, we partition the element iteration space into four -// subsets (or "colors") indicated by numbers in the figure below. 
-// -// ----------------- -// | 2 | 3 | 2 | 3 | -// ----------------- -// | 0 | 1 | 0 | 1 | -// ----------------- -// | 2 | 3 | 2 | 3 | -// ----------------- -// | 0 | 1 | 0 | 1 | -// ----------------- -// -// Since none of the elements with the same number share a common vertex, -// we can iterate over each subset ("color") in parallel. -// -// We use RAJA ListSegments and a RAJA IndexSet to define the element -// partitioning. -// - -// _vertexarea_color_start -// -// Gather the element indices for each color in a vector. -// - std::vector< std::vector > idx(4); - - for (int ie = 0; ie < Nelem_tot; ++ie) { + // _cstyle_vertexarea_seq_end + + // std::cout << "\n Vertex areas (reference)...\n"; + // printMeshData(areav_ref, Nvert, jvoff); + + + //----------------------------------------------------------------------------// + // + // In the following, we partition the element iteration space into four + // subsets (or "colors") indicated by numbers in the figure below. + // + // ----------------- + // | 2 | 3 | 2 | 3 | + // ----------------- + // | 0 | 1 | 0 | 1 | + // ----------------- + // | 2 | 3 | 2 | 3 | + // ----------------- + // | 0 | 1 | 0 | 1 | + // ----------------- + // + // Since none of the elements with the same number share a common vertex, + // we can iterate over each subset ("color") in parallel. + // + // We use RAJA ListSegments and a RAJA IndexSet to define the element + // partitioning. + // + + // _vertexarea_color_start + // + // Gather the element indices for each color in a vector. + // + std::vector> idx(4); + + for (int ie = 0; ie < Nelem_tot; ++ie) + { int i = ie % Nelem; int j = ie / Nelem; - if ( i % 2 == 0 ) { - if ( j % 2 == 0 ) { + if (i % 2 == 0) + { + if (j % 2 == 0) + { idx[0].push_back(ie); - } else { + } + else + { idx[2].push_back(ie); } - } else { - if ( j % 2 == 0 ) { + } + else + { + if (j % 2 == 0) + { idx[1].push_back(ie); - } else { + } + else + { idx[3].push_back(ie); } } } -// _vertexarea_color_end + // _vertexarea_color_end -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. Note that we use the vectors -// defined above in this variant to run each element subset in parallel. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. Note that we use the vectors + // defined above in this variant to run each element subset in parallel. 
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running C-style OpenMP vertex sum...\n"; -// _cstyle_vertexarea_omp_start + // _cstyle_vertexarea_omp_start std::memset(areav, 0, Nvert_tot * sizeof(double)); - for (int icol = 0; icol < 4; ++icol) { - const std::vector& ievec = idx[icol]; - const int len = static_cast(ievec.size()); - - #pragma omp parallel for - for (int i = 0; i < len; ++i) { - int ie = ievec[i]; - int* iv = &(e2v_map[4*ie]); - areav[ iv[0] ] += areae[ie] / 4.0 ; - areav[ iv[1] ] += areae[ie] / 4.0 ; - areav[ iv[2] ] += areae[ie] / 4.0 ; - areav[ iv[3] ] += areae[ie] / 4.0 ; - } - + for (int icol = 0; icol < 4; ++icol) + { + const std::vector& ievec = idx[icol]; + const int len = static_cast(ievec.size()); + +#pragma omp parallel for + for (int i = 0; i < len; ++i) + { + int ie = ievec[i]; + int* iv = &(e2v_map[4 * ie]); + areav[iv[0]] += areae[ie] / 4.0; + areav[iv[1]] += areae[ie] / 4.0; + areav[iv[2]] += areae[ie] / 4.0; + areav[iv[3]] += areae[ie] / 4.0; + } } -// _cstyle_vertexarea_omp_end + // _cstyle_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex areas (reference)...\n"; -//printMeshData(areav_ref, Nvert, jvoff); + // std::cout << "\n Vertex areas (reference)...\n"; + // printMeshData(areav_ref, Nvert, jvoff); #endif // The IndexSet is a variadic template, where the template arguments -// are the segment types that the IndexSet can hold. -// -#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) -// _vertexarea_listsegtype_start +// are the segment types that the IndexSet can hold. +// +#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) || \ + defined(RAJA_ENABLE_HIP) + // _vertexarea_listsegtype_start using SegmentType = RAJA::TypedListSegment; // _vertexarea_listsegtype_end #endif #if defined(RAJA_ENABLE_OPENMP) -// -// Resource object used to construct list segment objects with indices -// living in host (CPU) memory. -// + // + // Resource object used to construct list segment objects with indices + // living in host (CPU) memory. + // camp::resources::Resource host_res{camp::resources::Host()}; -// -// Create a RAJA IndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. + // + // Create a RAJA IndexSet with four ListSegments, one for the indices of + // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA + // variants of the vertex sum calculation. 
-// _vertexarea_indexset_start + // _vertexarea_indexset_start RAJA::TypedIndexSet colorset; - colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), host_res) ); - colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), host_res) ); - colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), host_res) ); - colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), host_res) ); -// _vertexarea_indexset_end + colorset.push_back(SegmentType(&idx[0][0], idx[0].size(), host_res)); + colorset.push_back(SegmentType(&idx[1][0], idx[1].size(), host_res)); + colorset.push_back(SegmentType(&idx[2][0], idx[2].size(), host_res)); + colorset.push_back(SegmentType(&idx[3][0], idx[3].size(), host_res)); + // _vertexarea_indexset_end -//----------------------------------------------------------------------------// -// RAJA OpenMP vertex sum calculation using IndexSet (sequential iteration -// over segments, OpenMP parallel iteration of each segment) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA OpenMP vertex sum calculation using IndexSet (sequential iteration + // over segments, OpenMP parallel iteration of each segment) + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP index set vertex sum...\n"; - std::memset(areav, 0, Nvert*Nvert * sizeof(double)); + std::memset(areav, 0, Nvert * Nvert * sizeof(double)); -// _raja_vertexarea_omp_start - using EXEC_POL1 = RAJA::ExecPolicy; + // _raja_vertexarea_omp_start + using EXEC_POL1 = + RAJA::ExecPolicy; RAJA::forall(colorset, [=](int ie) { - int* iv = &(e2v_map[4*ie]); - areav[ iv[0] ] += areae[ie] / 4.0 ; - areav[ iv[1] ] += areae[ie] / 4.0 ; - areav[ iv[2] ] += areae[ie] / 4.0 ; - areav[ iv[3] ] += areae[ie] / 4.0 ; + int* iv = &(e2v_map[4 * ie]); + areav[iv[0]] += areae[ie] / 4.0; + areav[iv[1]] += areae[ie] / 4.0; + areav[iv[2]] += areae[ie] / 4.0; + areav[iv[3]] += areae[ie] / 4.0; }); -// _raja_vertexarea_omp_end + // _raja_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(areav, Nvert, Nvert); + // std::cout << "\n Vertex volumes...\n"; + // printMeshData(areav, Nvert, Nvert); #endif -//----------------------------------------------------------------------------// -// RAJA CUDA vertex sum calculation using IndexSet (sequential iteration -// over segments, CUDA kernel launched for each segment) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA CUDA vertex sum calculation using IndexSet (sequential iteration + // over segments, CUDA kernel launched for each segment) + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -// -// Resource object used to construct list segment objects with indices -// living in device (GPU) memory. -// + // + // Resource object used to construct list segment objects with indices + // living in device (GPU) memory. + // camp::resources::Resource cuda_res{camp::resources::Cuda()}; -// -// Create a RAJA IndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. 
+ // + // Create a RAJA IndexSet with four ListSegments, one for the indices of + // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA + // variants of the vertex sum calculation. RAJA::TypedIndexSet cuda_colorset; - cuda_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), cuda_res) ); - cuda_colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), cuda_res) ); - cuda_colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), cuda_res) ); - cuda_colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), cuda_res) ); + cuda_colorset.push_back(SegmentType(&idx[0][0], idx[0].size(), cuda_res)); + cuda_colorset.push_back(SegmentType(&idx[1][0], idx[1].size(), cuda_res)); + cuda_colorset.push_back(SegmentType(&idx[2][0], idx[2].size(), cuda_res)); + cuda_colorset.push_back(SegmentType(&idx[3][0], idx[3].size(), cuda_res)); std::cout << "\n Running RAJA CUDA index set vertex sum...\n"; - std::memset(areav, 0, Nvert*Nvert * sizeof(double)); + std::memset(areav, 0, Nvert * Nvert * sizeof(double)); -// _raja_vertexarea_cuda_start - using EXEC_POL2 = RAJA::ExecPolicy>; + // _raja_vertexarea_cuda_start + using EXEC_POL2 = + RAJA::ExecPolicy>; - RAJA::forall(cuda_colorset, [=] RAJA_DEVICE (int ie) { - int* iv = &(e2v_map[4*ie]); - areav[ iv[0] ] += areae[ie] / 4.0 ; - areav[ iv[1] ] += areae[ie] / 4.0 ; - areav[ iv[2] ] += areae[ie] / 4.0 ; - areav[ iv[3] ] += areae[ie] / 4.0 ; + RAJA::forall(cuda_colorset, [=] RAJA_DEVICE(int ie) { + int* iv = &(e2v_map[4 * ie]); + areav[iv[0]] += areae[ie] / 4.0; + areav[iv[1]] += areae[ie] / 4.0; + areav[iv[2]] += areae[ie] / 4.0; + areav[iv[3]] += areae[ie] / 4.0; }); -// _raja_vertexarea_cuda_end + // _raja_vertexarea_cuda_end checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(areav, Nvert, jvoff); + // std::cout << "\n Vertex volumes...\n"; + // printMeshData(areav, Nvert, jvoff); #endif -//----------------------------------------------------------------------------// -// RAJA HIP vertex sum calculation using IndexSet (sequential iteration -// over segments, HIP kernel launched for each segment) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA HIP vertex sum calculation using IndexSet (sequential iteration + // over segments, HIP kernel launched for each segment) + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -// -// Allocate and initialize device memory arrays -// + // + // Allocate and initialize device memory arrays + // double* d_areae = memoryManager::allocate_gpu(Nelem_tot); double* d_areav = memoryManager::allocate_gpu(Nvert_tot); - int* d_e2v_map = memoryManager::allocate_gpu(4*Nelem_tot); + int* d_e2v_map = memoryManager::allocate_gpu(4 * Nelem_tot); - hipMemcpy(d_areae, areae, Nelem_tot*sizeof(double), hipMemcpyHostToDevice); - hipMemcpy(d_e2v_map, e2v_map, 4*Nelem_tot*sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(d_areae, areae, Nelem_tot * sizeof(double), hipMemcpyHostToDevice); + hipMemcpy( + d_e2v_map, e2v_map, 4 * Nelem_tot * sizeof(int), hipMemcpyHostToDevice); std::memset(areav, 0, Nvert_tot * sizeof(double)); - hipMemcpy(d_areav, areav, Nvert_tot*sizeof(double), hipMemcpyHostToDevice); + hipMemcpy(d_areav, areav, Nvert_tot * sizeof(double), hipMemcpyHostToDevice); -// -// Resource object used to construct list segment objects with indices -// living in device (GPU) memory. 
-// + // + // Resource object used to construct list segment objects with indices + // living in device (GPU) memory. + // camp::resources::Resource hip_res{camp::resources::Hip()}; -// -// Create a RAJA IndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. + // + // Create a RAJA IndexSet with four ListSegments, one for the indices of + // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA + // variants of the vertex sum calculation. RAJA::TypedIndexSet hip_colorset; - hip_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), hip_res) ); - hip_colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), hip_res) ); - hip_colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), hip_res) ); - hip_colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), hip_res) ); + hip_colorset.push_back(SegmentType(&idx[0][0], idx[0].size(), hip_res)); + hip_colorset.push_back(SegmentType(&idx[1][0], idx[1].size(), hip_res)); + hip_colorset.push_back(SegmentType(&idx[2][0], idx[2].size(), hip_res)); + hip_colorset.push_back(SegmentType(&idx[3][0], idx[3].size(), hip_res)); std::cout << "\n Running RAJA HIP index set vertex sum...\n"; -// _raja_vertexarea_hip_start - using EXEC_POL3 = RAJA::ExecPolicy>; + // _raja_vertexarea_hip_start + using EXEC_POL3 = + RAJA::ExecPolicy>; - RAJA::forall(hip_colorset, [=] RAJA_DEVICE (int ie) { - int* iv = &(d_e2v_map[4*ie]); - d_areav[ iv[0] ] += d_areae[ie] / 4.0 ; - d_areav[ iv[1] ] += d_areae[ie] / 4.0 ; - d_areav[ iv[2] ] += d_areae[ie] / 4.0 ; - d_areav[ iv[3] ] += d_areae[ie] / 4.0 ; + RAJA::forall(hip_colorset, [=] RAJA_DEVICE(int ie) { + int* iv = &(d_e2v_map[4 * ie]); + d_areav[iv[0]] += d_areae[ie] / 4.0; + d_areav[iv[1]] += d_areae[ie] / 4.0; + d_areav[iv[2]] += d_areae[ie] / 4.0; + d_areav[iv[3]] += d_areae[ie] / 4.0; }); -// _raja_vertexarea_hip_end + // _raja_vertexarea_hip_end - hipMemcpy(areav, d_areav, Nvert_tot*sizeof(double), hipMemcpyDeviceToHost); + hipMemcpy(areav, d_areav, Nvert_tot * sizeof(double), hipMemcpyDeviceToHost); checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(areav, Nvert, jvoff); + // std::cout << "\n Vertex volumes...\n"; + // printMeshData(areav, Nvert, jvoff); memoryManager::deallocate_gpu(d_areae); memoryManager::deallocate_gpu(d_areav); @@ -398,7 +414,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // Clean up... 
memoryManager::deallocate(areae); @@ -417,12 +433,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) void checkResult(double* a, double* aref, int n) { bool correct = true; - for (int i = 0; i < n*n; i++) { - if ( correct && std::abs(a[i] - aref[i]) > 10e-12 ) { correct = false; } + for (int i = 0; i < n * n; i++) + { + if (correct && std::abs(a[i] - aref[i]) > 10e-12) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -433,11 +456,12 @@ void checkResult(double* a, double* aref, int n) void printMeshData(double* v, int n, int joff) { std::cout << std::endl; - for (int j = 0 ; j < n ; ++j) { - for (int i = 0 ; i < n ; ++i) { - int ii = i + j*joff ; - std::cout << "v(" << i << "," << j << ") = " - << v[ii] << std::endl; + for (int j = 0; j < n; ++j) + { + for (int i = 0; i < n; ++i) + { + int ii = i + j * joff; + std::cout << "v(" << i << "," << j << ") = " << v[ii] << std::endl; } } std::cout << std::endl; diff --git a/exercises/view-layout.cpp b/exercises/view-layout.cpp index 0f9383e95e..c743b84d28 100644 --- a/exercises/view-layout.cpp +++ b/exercises/view-layout.cpp @@ -22,9 +22,9 @@ * RAJA features shown: * - RAJA::View * - RAJA::Layout - * - Layout permutations + * - Layout permutations * - OffsetLayout - * - OffsetLayout permutations + * - OffsetLayout permutations * * NOTE: no RAJA kernel execution methods are used in these examples. */ @@ -38,16 +38,16 @@ void checkResult(T* C, T* Cref, int N); template void printValues(T* C, int N); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA view & layout exercises...\n"; -//----------------------------------------------------------------------------// -// -// Matrix-matrix multiplication: default layout -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Matrix-matrix multiplication: default layout + // + //----------------------------------------------------------------------------// // _matmult_init_start // @@ -58,84 +58,92 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate storage for matrices and initialize matrix entries // - double *A = new double[ N * N ]; - double *B = new double[ N * N ]; - double *C = new double[ N * N ]; - double *Cref = new double[ N * N ]; - - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - A[ col + N*row ] = row + 1; - B[ col + N*row ] = col + 1; - C[ col + N*row ] = 0.0; - Cref[ col + N*row ] = 0.0; + double* A = new double[N * N]; + double* B = new double[N * N]; + double* C = new double[N * N]; + double* Cref = new double[N * N]; + + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + A[col + N * row] = row + 1; + B[col + N * row] = col + 1; + C[col + N * row] = 0.0; + Cref[col + N * row] = 0.0; } } // _matmult_init_end -//printValues(A, N*N); -//printValues(B, N*N); -//printValues(C, N*N); -//printValues(Cref, N*N); + // printValues(A, N*N); + // printValues(B, N*N); + // printValues(C, N*N); + // printValues(Cref, N*N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running matrix multiplication 
reference solution...\n"; // _cstyle_matmult_start - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - for (int k = 0; k < N; ++k) { - Cref[col + N*row] += A[k + N*row] * B[col + N*k]; + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + for (int k = 0; k < N; ++k) + { + Cref[col + N * row] += A[k + N * row] * B[col + N * k]; } } } // _cstyle_matmult_end -//printValues(Cref, N*N); + // printValues(Cref, N*N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running matrix multiplication w/Views...\n"; - // + // // Define RAJA View objects to simplify access to the matrix entries. - // - // Note: we use default Layout + // + // Note: we use default Layout // // _matmult_views_start - RAJA::View< double, RAJA::Layout<2, int> > Aview(A, N, N); - RAJA::View< double, RAJA::Layout<2, int> > Bview(B, N, N); - RAJA::View< double, RAJA::Layout<2, int> > Cview(C, N, N); + RAJA::View> Aview(A, N, N); + RAJA::View> Bview(B, N, N); + RAJA::View> Cview(C, N, N); // _matmult_views_end // _cstyle_matmult_views_start - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - for (int k = 0; k < N; ++k) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + for (int k = 0; k < N; ++k) + { Cview(row, col) += Aview(row, k) * Bview(k, col); } } } // _cstyle_matmult_views_end - checkResult(C, Cref, N*N); -//printValues(C, N*N); + checkResult(C, Cref, N * N); + // printValues(C, N*N); -// -// Clean up. -// - delete [] A; - delete [] B; - delete [] C; - delete [] Cref; + // + // Clean up. + // + delete[] A; + delete[] B; + delete[] C; + delete[] Cref; -//----------------------------------------------------------------------------// -// -// Default layouts use row-major data ordering -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Default layouts use row-major data ordering + // + //----------------------------------------------------------------------------// // // Define dimensions and allocate arrays @@ -144,9 +152,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int Nx = 3; constexpr int Ny = 5; constexpr int Nz = 2; - constexpr int Ntot = Nx*Ny*Nz; - int* a = new int[ Ntot ]; - int* aref = new int[ Ntot ]; + constexpr int Ntot = Nx * Ny * Nz; + int* a = new int[Ntot]; + int* aref = new int[Ntot]; for (int i = 0; i < Ntot; ++i) { @@ -154,49 +162,52 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _default_views_init_end -//printValues(ref, Ntot); + // printValues(ref, Ntot); -//----------------------------------------// + //----------------------------------------// std::cout << "\n Running default layout view cases...\n"; std::cout << "\n\t Running 1D view case...\n"; - + std::memset(a, 0, Ntot * sizeof(int)); - - // _default_view1D_start - RAJA::View< int, RAJA::Layout<1, int> > view_1D(a, Ntot); - for (int i = 0; i < Ntot; ++i) { + // _default_view1D_start + RAJA::View> view_1D(a, Ntot); + + for (int i = 0; i < Ntot; ++i) + { view_1D(i) = i; } - // _default_view1D_end + // _default_view1D_end checkResult(a, aref, Ntot); -//printValues(a, Ntot); + // printValues(a, Ntot); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t 
Running 2D default layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); - + // _default_view2D_start - RAJA::View< int, RAJA::Layout<2, int> > view_2D(a, Nx, Ny); + RAJA::View> view_2D(a, Nx, Ny); int iter{0}; - for (int i = 0; i < Nx; ++i) { - for (int j = 0; j < Ny; ++j) { + for (int i = 0; i < Nx; ++i) + { + for (int j = 0; j < Ny; ++j) + { view_2D(i, j) = iter; ++iter; } } // _default_view2D_end - checkResult(a, aref, Nx*Ny); -//printValues(a, Nx*Ny); + checkResult(a, aref, Nx * Ny); + // printValues(a, Nx*Ny); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D default layout view case...\n"; @@ -205,47 +216,49 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Implement a triple loop nest using a RAJA::View and + /// EXERCISE: Implement a triple loop nest using a RAJA::View and /// three-dimensional RAJA::Layout that iterates over the /// data array 'a' with unit stride. /// - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -//----------------------------------------------------------------------------// -// -// Permuted layouts change the data striding order -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Permuted layouts change the data striding order + // + //----------------------------------------------------------------------------// std::cout << "\n Running permuted layout cases...\n"; -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D default permutation view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); // _default_perm_view2D_start - std::array defperm2 {{0, 1}}; - RAJA::Layout< 2, int > defperm2_layout = - RAJA::make_permuted_layout( {{Nx, Ny}}, defperm2); - RAJA::View< int, RAJA::Layout<2, int> > defperm_view_2D(a, defperm2_layout); + std::array defperm2{{0, 1}}; + RAJA::Layout<2, int> defperm2_layout = + RAJA::make_permuted_layout({{Nx, Ny}}, defperm2); + RAJA::View> defperm_view_2D(a, defperm2_layout); iter = 0; - for (int i = 0; i < Nx; ++i) { - for (int j = 0; j < Ny; ++j) { + for (int i = 0; i < Nx; ++i) + { + for (int j = 0; j < Ny; ++j) + { defperm_view_2D(i, j) = iter; ++iter; } } // _default_perm_view2D_end - checkResult(a, aref, Nx*Ny); -//printValues(a, Nx*Ny); + checkResult(a, aref, Nx * Ny); + // printValues(a, Nx*Ny); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D default permutation view case...\n"; @@ -258,35 +271,37 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// three-dimensional RAJA::Layout with the identity permutation. 
/// - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -//----------------------------------------// -//----------------------------------------// + //----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D permuted layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); // _perm_2D_start - std::array perm2 {{1, 0}}; - RAJA::Layout< 2, int > perm2_layout = - RAJA::make_permuted_layout( {{Nx, Ny}}, perm2); - RAJA::View< int, RAJA::Layout<2, int> > perm_view_2D(a, perm2_layout); + std::array perm2{{1, 0}}; + RAJA::Layout<2, int> perm2_layout = + RAJA::make_permuted_layout({{Nx, Ny}}, perm2); + RAJA::View> perm_view_2D(a, perm2_layout); iter = 0; - for (int j = 0; j < Ny; ++j) { - for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) + { + for (int i = 0; i < Nx; ++i) + { perm_view_2D(i, j) = iter; ++iter; } } // _perm_2D_end - checkResult(a, aref, Nx*Ny); -//printValues(a, Nx*Ny); + checkResult(a, aref, Nx * Ny); + // printValues(a, Nx*Ny); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D perma layout view case...\n"; @@ -297,7 +312,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// EXERCISE: Implement a triple loop nest using a RAJA::View and /// three-dimensional RAJA::Layout with the permutation - /// {2, 1, 0}. + /// {2, 1, 0}. /// /// Name the Layout object 'perm3a_layout' so it can be used /// with the index conversion methods in the section below. @@ -305,25 +320,28 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// Layout object you create here. /// - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D permb layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); // _permb_view3D_start - std::array perm3b {{1, 2, 0}}; - RAJA::Layout< 3, int > perm3b_layout = - RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, perm3b); - RAJA::View< int, RAJA::Layout<3, int> > perm3b_view_3D(a, perm3b_layout); + std::array perm3b{{1, 2, 0}}; + RAJA::Layout<3, int> perm3b_layout = + RAJA::make_permuted_layout({{Nx, Ny, Nz}}, perm3b); + RAJA::View> perm3b_view_3D(a, perm3b_layout); iter = 0; - for (int j = 0; j < Ny; ++j) { - for (int k = 0; k < Nz; ++k) { - for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) + { + for (int k = 0; k < Nz; ++k) + { + for (int i = 0; i < Nx; ++i) + { perm3b_view_3D(i, j, k) = iter; ++iter; } @@ -331,29 +349,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _permb_view3D_end - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -// -// Clean up. -// - delete [] a; - delete [] aref; + // + // Clean up. + // + delete[] a; + delete[] aref; -//----------------------------------------------------------------------------// -// -// Layouts: multi-dimensional indices vs. linear indicies -// -// RAJA::Layout type has methods that can be used to convert between -// multi-dimensional and linear indices. We show these below using the -// three-dimensional layouts in the examples above. 
Recall the Nx, Ny, Nz -// sizes defined earlier: -// -// constexpr int Nx = 3; -// constexpr int Ny = 5; -// constexpr int Nz = 2; -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Layouts: multi-dimensional indices vs. linear indicies + // + // RAJA::Layout type has methods that can be used to convert between + // multi-dimensional and linear indices. We show these below using the + // three-dimensional layouts in the examples above. Recall the Nx, Ny, Nz + // sizes defined earlier: + // + // constexpr int Nx = 3; + // constexpr int Ny = 5; + // constexpr int Nz = 2; + // + //----------------------------------------------------------------------------// std::cout << "\n Multi-dimensional indices to linear indices...\n"; @@ -361,44 +379,44 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\nperm3a_layout...\n" << std::endl; int lin = -1; - int i = -1; - int j = -1; - int k = -1; + int i = -1; + int j = -1; + int k = -1; -/* - // _perm3d_layout_start - lin = perm3a_layout(1, 2, 0); - std::cout << "\tperm3a_layout(1, 2, 0) = " << lin << std::endl; - std::cout << "\t Should be 7 = 1 + 2 * Nx + 0 * Nx * Ny " - << "(since perm is {2, 1, 0})" << std::endl; + /* + // _perm3d_layout_start + lin = perm3a_layout(1, 2, 0); + std::cout << "\tperm3a_layout(1, 2, 0) = " << lin << std::endl; + std::cout << "\t Should be 7 = 1 + 2 * Nx + 0 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; - perm3a_layout.toIndices(7, i, j, k); - std::cout << "\tperm3a_layout.toIndices(7, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; - // _perm3d_layout_end + perm3a_layout.toIndices(7, i, j, k); + std::cout << "\tperm3a_layout.toIndices(7, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + // _perm3d_layout_end - lin = perm3a_layout(2, 3, 1); - std::cout << "\tperm3a_layout(2, 3, 1) = " << lin << std::endl; - std::cout << "\t Should be 26 = 2 + 3 * Nx + 1 * Nx * Ny " - << "(since perm is {2, 1, 0})" << std::endl; + lin = perm3a_layout(2, 3, 1); + std::cout << "\tperm3a_layout(2, 3, 1) = " << lin << std::endl; + std::cout << "\t Should be 26 = 2 + 3 * Nx + 1 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; - perm3a_layout.toIndices(26, i, j, k); - std::cout << "\tperm3a_layout.toIndices(26, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + perm3a_layout.toIndices(26, i, j, k); + std::cout << "\tperm3a_layout.toIndices(26, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; - lin = perm3a_layout(0, 2, 1); - std::cout << "\tperm3a_layout(0, 2, 1) = " << lin << std::endl; - std::cout << "\t Should be 21 = 0 + 2 * Nx + 1 * Nx * Ny " - << "(since perm is {2, 1, 0})" << std::endl; + lin = perm3a_layout(0, 2, 1); + std::cout << "\tperm3a_layout(0, 2, 1) = " << lin << std::endl; + std::cout << "\t Should be 21 = 0 + 2 * Nx + 1 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; - perm3a_layout.toIndices(21, i, j, k); - std::cout << "\tperm3a_layout.toIndices(21, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; -*/ + perm3a_layout.toIndices(21, i, j, k); + std::cout << "\tperm3a_layout.toIndices(21, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + */ 
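The commented-out block above exercises the two directions of a RAJA::Layout index mapping: operator() maps (i, j, k) to a linear offset and toIndices() inverts it. A self-contained sketch of the same round trip for the {1, 2, 0} permutation used in the next section, independent of the patch; the std::array element type RAJA::idx_t is assumed, matching typical RAJA example code:

  #include <array>
  #include <iostream>
  #include "RAJA/RAJA.hpp"

  int main()
  {
    constexpr int Nx = 3;
    constexpr int Ny = 5;
    constexpr int Nz = 2;

    // Permutation {1, 2, 0}: j strides slowest, k next, i has unit stride,
    // so lin = i + k * Nx + j * Nx * Nz.
    std::array<RAJA::idx_t, 3> perm{{1, 2, 0}};
    RAJA::Layout<3, int> layout =
        RAJA::make_permuted_layout({{Nx, Ny, Nz}}, perm);

    int lin = layout(1, 2, 0);  // 1 + 0 * Nx + 2 * Nx * Nz = 13
    int i, j, k;
    layout.toIndices(lin, i, j, k);  // recovers (1, 2, 0)

    std::cout << "lin = " << lin << ", (i, j, k) = (" << i << ", " << j
              << ", " << k << ")" << std::endl;
    return 0;
  }

The printed value 13 agrees with the perm3b_layout example below, where toIndices(13, i, j, k) recovers (1, 2, 0).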
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\nperm3b_layout...\n" << std::endl; @@ -409,7 +427,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3b_layout.toIndices(13, i, j, k); std::cout << "\tperm3b_layout.toIndices(13, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; lin = perm3b_layout(2, 3, 1); @@ -419,7 +438,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3b_layout.toIndices(23, i, j, k); std::cout << "\tperm3b_layout.toIndices(23, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; lin = perm3b_layout(0, 2, 1); @@ -428,7 +448,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) << "(since perm is {1, 2, 0})" << std::endl; perm3b_layout.toIndices(15, i, j, k); std::cout << "\tperm3b_layout.toIndices(15, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; /// /// TODO... @@ -438,11 +459,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// data array 'a' with unit stride. /// -//----------------------------------------------------------------------------// -// -// Offset layouts apply offsets to indices -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Offset layouts apply offsets to indices + // + //----------------------------------------------------------------------------// std::cout << "\n Running offset layout cases...\n"; @@ -450,10 +471,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Define some dimensions, and allocate arrays // constexpr int Ntot_ao = 40; - int* ao = new int[ Ntot_ao ]; - int* ao_ref = new int[ Ntot_ao ]; + int* ao = new int[Ntot_ao]; + int* ao_ref = new int[Ntot_ao]; -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 1D offset layout case...\n"; @@ -467,33 +488,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) int imin = -5; int imax = 6; - for (int i = imin; i < imax; ++i) { - ao_ref[ i-imin ] = i; + for (int i = imin; i < imax; ++i) + { + ao_ref[i - imin] = i; } // _cstyle_offlayout1D_end -//printValues(ao_ref, imax-imin); + // printValues(ao_ref, imax-imin); -//----------------------------------------// + //----------------------------------------// std::memset(ao, 0, Ntot_ao * sizeof(int)); // _raja_offlayout1D_start - RAJA::OffsetLayout<1, int> offlayout_1D = - RAJA::make_offset_layout<1, int>( {{imin}}, {{imax}} ); + RAJA::OffsetLayout<1, int> offlayout_1D = + RAJA::make_offset_layout<1, int>({{imin}}, {{imax}}); - RAJA::View< int, RAJA::OffsetLayout<1, int> > aoview_1Doff(ao, - offlayout_1D); + RAJA::View> aoview_1Doff(ao, offlayout_1D); - for (int i = imin; i < imax; ++i) { + for (int i = imin; i < imax; ++i) + { aoview_1Doff(i) = i; } // _raja_offlayout1D_end - checkResult(ao, ao_ref, imax-imin); -//printValues(ao, 11); + checkResult(ao, ao_ref, imax - imin); + // printValues(ao, 11); -//----------------------------------------// + 
//----------------------------------------// std::cout << "\n\t Running 2D offset layout case...\n"; @@ -510,17 +532,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) int jmax = 5; iter = 0; - for (int i = imin; i < imax; ++i) { - for (int j = jmin; j < jmax; ++j) { - ao_ref[ (j-jmin) + (i-imin) * (jmax-jmin) ] = iter; + for (int i = imin; i < imax; ++i) + { + for (int j = jmin; j < jmax; ++j) + { + ao_ref[(j - jmin) + (i - imin) * (jmax - jmin)] = iter; iter++; } } // _cstyle_offlayout2D_end -//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + // printValues(ao_ref, (imax-imin)*(jmax-jmin)); -//----------------------------------------// + //----------------------------------------// std::memset(ao, 0, Ntot_ao * sizeof(int)); @@ -532,10 +556,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// same operations as the C-style example above. /// - checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); -//printValues(ao, (imax-imin)*(jmax-jmin)); + checkResult(ao, ao_ref, (imax - imin) * (jmax - jmin)); + // printValues(ao, (imax-imin)*(jmax-jmin)); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D permuted offset layout case...\n"; @@ -547,50 +571,51 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _cstyle_permofflayout2D_start iter = 0; - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { - ao_ref[ (i-imin) + (j-jmin) * (imax-imin) ] = iter; + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { + ao_ref[(i - imin) + (j - jmin) * (imax - imin)] = iter; iter++; } } // _cstyle_permofflayout2D_end -//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + // printValues(ao_ref, (imax-imin)*(jmax-jmin)); -//----------------------------------------// + //----------------------------------------// std::memset(ao, 0, Ntot_ao * sizeof(int)); // _raja_permofflayout2D_start - std::array perm1D {{1, 0}}; - RAJA::OffsetLayout<2> permofflayout_2D = - RAJA::make_permuted_offset_layout<2>( {{imin, jmin}}, - {{imax, jmax}}, - perm1D ); + std::array perm1D{{1, 0}}; + RAJA::OffsetLayout<2> permofflayout_2D = RAJA::make_permuted_offset_layout<2>( + {{imin, jmin}}, {{imax, jmax}}, perm1D); - RAJA::View< int, RAJA::OffsetLayout<2> > aoview_2Dpermoff(ao, - permofflayout_2D); + RAJA::View> aoview_2Dpermoff(ao, permofflayout_2D); iter = 0; - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { aoview_2Dpermoff(i, j) = iter; iter++; } } // _raja_permofflayout2D_end - checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); -//printValues(ao, (imax-imin)*(jmax-jmin)); + checkResult(ao, ao_ref, (imax - imin) * (jmax - jmin)); + // printValues(ao, (imax-imin)*(jmax-jmin)); -// -// Clean up. -// - delete [] ao; - delete [] ao_ref; + // + // Clean up. 
+ // + delete[] ao; + delete[] ao_ref; -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n DONE!...\n"; @@ -604,14 +629,19 @@ template void checkResult(T* C, T* Cref, int N) { bool match = true; - for (int i = 0; i < N; ++i) { - if ( std::abs( C[i] - Cref[i] ) > 10e-12 ) { + for (int i = 0; i < N; ++i) + { + if (std::abs(C[i] - Cref[i]) > 10e-12) + { match = false; } } - if ( match ) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -619,7 +649,8 @@ void checkResult(T* C, T* Cref, int N) template void printValues(T* C, int N) { - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { std::cout << "array[" << i << "] = " << C[i] << std::endl; - } + } }; diff --git a/exercises/view-layout_solution.cpp b/exercises/view-layout_solution.cpp index 7614c993a8..cb87b84aa4 100644 --- a/exercises/view-layout_solution.cpp +++ b/exercises/view-layout_solution.cpp @@ -22,9 +22,9 @@ * RAJA features shown: * - RAJA::View * - RAJA::Layout - * - Layout permutations + * - Layout permutations * - OffsetLayout - * - OffsetLayout permutations + * - OffsetLayout permutations * * NOTE: no RAJA kernel execution methods are used in these examples. */ @@ -38,16 +38,16 @@ void checkResult(T* C, T* Cref, int N); template void printValues(T* C, int N); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA view & layout exercises...\n"; -//----------------------------------------------------------------------------// -// -// Matrix-matrix multiplication: default layout -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Matrix-matrix multiplication: default layout + // + //----------------------------------------------------------------------------// // _matmult_init_start // @@ -58,84 +58,92 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate storage for matrices and initialize matrix entries // - double *A = new double[ N * N ]; - double *B = new double[ N * N ]; - double *C = new double[ N * N ]; - double *Cref = new double[ N * N ]; - - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - A[ col + N*row ] = row + 1; - B[ col + N*row ] = col + 1; - C[ col + N*row ] = 0.0; - Cref[ col + N*row ] = 0.0; + double* A = new double[N * N]; + double* B = new double[N * N]; + double* C = new double[N * N]; + double* Cref = new double[N * N]; + + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + A[col + N * row] = row + 1; + B[col + N * row] = col + 1; + C[col + N * row] = 0.0; + Cref[col + N * row] = 0.0; } } // _matmult_init_end -//printValues(A, N*N); -//printValues(B, N*N); -//printValues(C, N*N); -//printValues(Cref, N*N); + // printValues(A, N*N); + // printValues(B, N*N); + // printValues(C, N*N); + // printValues(Cref, N*N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running matrix 
multiplication reference solution...\n"; // _cstyle_matmult_start - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - for (int k = 0; k < N; ++k) { - Cref[col + N*row] += A[k + N*row] * B[col + N*k]; + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + for (int k = 0; k < N; ++k) + { + Cref[col + N * row] += A[k + N * row] * B[col + N * k]; } } } // _cstyle_matmult_end -//printValues(Cref, N*N); + // printValues(Cref, N*N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running matrix multiplication w/Views...\n"; - // + // // Define RAJA View objects to simplify access to the matrix entries. - // - // Note: we use default Layout + // + // Note: we use default Layout // // _matmult_views_start - RAJA::View< double, RAJA::Layout<2, int> > Aview(A, N, N); - RAJA::View< double, RAJA::Layout<2, int> > Bview(B, N, N); - RAJA::View< double, RAJA::Layout<2, int> > Cview(C, N, N); + RAJA::View> Aview(A, N, N); + RAJA::View> Bview(B, N, N); + RAJA::View> Cview(C, N, N); // _matmult_views_end // _cstyle_matmult_views_start - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - for (int k = 0; k < N; ++k) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + for (int k = 0; k < N; ++k) + { Cview(row, col) += Aview(row, k) * Bview(k, col); } } } // _cstyle_matmult_views_end - checkResult(C, Cref, N*N); -//printValues(C, N*N); + checkResult(C, Cref, N * N); + // printValues(C, N*N); -// -// Clean up. -// - delete [] A; - delete [] B; - delete [] C; - delete [] Cref; + // + // Clean up. + // + delete[] A; + delete[] B; + delete[] C; + delete[] Cref; -//----------------------------------------------------------------------------// -// -// Default layouts use row-major data ordering -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Default layouts use row-major data ordering + // + //----------------------------------------------------------------------------// // // Define dimensions and allocate arrays @@ -144,9 +152,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int Nx = 3; constexpr int Ny = 5; constexpr int Nz = 2; - constexpr int Ntot = Nx*Ny*Nz; - int* a = new int[ Ntot ]; - int* aref = new int[ Ntot ]; + constexpr int Ntot = Nx * Ny * Nz; + int* a = new int[Ntot]; + int* aref = new int[Ntot]; for (int i = 0; i < Ntot; ++i) { @@ -154,61 +162,67 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _default_views_init_end -//printValues(ref, Ntot); + // printValues(ref, Ntot); -//----------------------------------------// + //----------------------------------------// std::cout << "\n Running default layout view cases...\n"; std::cout << "\n\t Running 1D view case...\n"; - + std::memset(a, 0, Ntot * sizeof(int)); - - // _default_view1D_start - RAJA::View< int, RAJA::Layout<1, int> > view_1D(a, Ntot); - for (int i = 0; i < Ntot; ++i) { + // _default_view1D_start + RAJA::View> view_1D(a, Ntot); + + for (int i = 0; i < Ntot; ++i) + { view_1D(i) = i; } - // _default_view1D_end + // _default_view1D_end checkResult(a, aref, Ntot); -//printValues(a, Ntot); + // printValues(a, Ntot); -//----------------------------------------// + //----------------------------------------// std::cout 
<< "\n\t Running 2D default layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); - + // _default_view2D_start - RAJA::View< int, RAJA::Layout<2, int> > view_2D(a, Nx, Ny); + RAJA::View> view_2D(a, Nx, Ny); int iter{0}; - for (int i = 0; i < Nx; ++i) { - for (int j = 0; j < Ny; ++j) { + for (int i = 0; i < Nx; ++i) + { + for (int j = 0; j < Ny; ++j) + { view_2D(i, j) = iter; ++iter; } } // _default_view2D_end - checkResult(a, aref, Nx*Ny); -//printValues(a, Nx*Ny); + checkResult(a, aref, Nx * Ny); + // printValues(a, Nx*Ny); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D default layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); - // _default_view3D_start - RAJA::View< int, RAJA::Layout<3, int> > view_3D(a, Nx, Ny, Nz); + // _default_view3D_start + RAJA::View> view_3D(a, Nx, Ny, Nz); iter = 0; - for (int i = 0; i < Nx; ++i) { - for (int j = 0; j < Ny; ++j) { - for (int k = 0; k < Nz; ++k) { + for (int i = 0; i < Nx; ++i) + { + for (int j = 0; j < Ny; ++j) + { + for (int k = 0; k < Nz; ++k) + { view_3D(i, j, k) = iter; ++iter; } @@ -216,57 +230,62 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _default_view3D_end - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -//----------------------------------------------------------------------------// -// -// Permuted layouts change the data striding order -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Permuted layouts change the data striding order + // + //----------------------------------------------------------------------------// std::cout << "\n Running permuted layout cases...\n"; -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D default permutation view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); // _default_perm_view2D_start - std::array defperm2 {{0, 1}}; - RAJA::Layout< 2, int > defperm2_layout = - RAJA::make_permuted_layout( {{Nx, Ny}}, defperm2); - RAJA::View< int, RAJA::Layout<2, int> > defperm_view_2D(a, defperm2_layout); + std::array defperm2{{0, 1}}; + RAJA::Layout<2, int> defperm2_layout = + RAJA::make_permuted_layout({{Nx, Ny}}, defperm2); + RAJA::View> defperm_view_2D(a, defperm2_layout); iter = 0; - for (int i = 0; i < Nx; ++i) { - for (int j = 0; j < Ny; ++j) { + for (int i = 0; i < Nx; ++i) + { + for (int j = 0; j < Ny; ++j) + { defperm_view_2D(i, j) = iter; ++iter; } } // _default_perm_view2D_end - checkResult(a, aref, Nx*Ny); -//printValues(a, Nx*Ny); + checkResult(a, aref, Nx * Ny); + // printValues(a, Nx*Ny); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D default permutation view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); // _default_perm_view3D_start - std::array defperm3 {{0, 1, 2}}; - RAJA::Layout< 3, int > defperm3_layout = - RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, defperm3); - RAJA::View< int, RAJA::Layout<3, int> > defperm_view_3D(a, defperm3_layout); + std::array defperm3{{0, 1, 2}}; + RAJA::Layout<3, int> defperm3_layout = + RAJA::make_permuted_layout({{Nx, Ny, Nz}}, defperm3); + RAJA::View> defperm_view_3D(a, defperm3_layout); iter = 0; - for (int i = 0; i < Nx; ++i) { - for (int j = 0; j < Ny; ++j) { - for (int k 
= 0; k < Nz; ++k) { + for (int i = 0; i < Nx; ++i) + { + for (int j = 0; j < Ny; ++j) + { + for (int k = 0; k < Nz; ++k) + { defperm_view_3D(i, j, k) = iter; ++iter; } @@ -274,50 +293,55 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _default_perm_view3D_end - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -//----------------------------------------// -//----------------------------------------// + //----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D permuted layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); // _perm_2D_start - std::array perm2 {{1, 0}}; - RAJA::Layout< 2, int > perm2_layout = - RAJA::make_permuted_layout( {{Nx, Ny}}, perm2); - RAJA::View< int, RAJA::Layout<2, int> > perm_view_2D(a, perm2_layout); + std::array perm2{{1, 0}}; + RAJA::Layout<2, int> perm2_layout = + RAJA::make_permuted_layout({{Nx, Ny}}, perm2); + RAJA::View> perm_view_2D(a, perm2_layout); iter = 0; - for (int j = 0; j < Ny; ++j) { - for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) + { + for (int i = 0; i < Nx; ++i) + { perm_view_2D(i, j) = iter; ++iter; } } // _perm_2D_end - checkResult(a, aref, Nx*Ny); -//printValues(a, Nx*Ny); + checkResult(a, aref, Nx * Ny); + // printValues(a, Nx*Ny); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D perma layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); // _perma_view3D_start - std::array perm3a {{2, 1, 0}}; - RAJA::Layout< 3, int > perm3a_layout = - RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, perm3a); - RAJA::View< int, RAJA::Layout<3, int> > perm3a_view_3D(a, perm3a_layout); + std::array perm3a{{2, 1, 0}}; + RAJA::Layout<3, int> perm3a_layout = + RAJA::make_permuted_layout({{Nx, Ny, Nz}}, perm3a); + RAJA::View> perm3a_view_3D(a, perm3a_layout); iter = 0; - for (int k = 0; k < Nz; ++k) { - for (int j = 0; j < Ny; ++j) { - for (int i = 0; i < Nx; ++i) { + for (int k = 0; k < Nz; ++k) + { + for (int j = 0; j < Ny; ++j) + { + for (int i = 0; i < Nx; ++i) + { perm3a_view_3D(i, j, k) = iter; ++iter; } @@ -325,25 +349,28 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _perma_view3D_end - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D permb layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); // _permb_view3D_start - std::array perm3b {{1, 2, 0}}; - RAJA::Layout< 3, int > perm3b_layout = - RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, perm3b); - RAJA::View< int, RAJA::Layout<3, int> > perm3b_view_3D(a, perm3b_layout); + std::array perm3b{{1, 2, 0}}; + RAJA::Layout<3, int> perm3b_layout = + RAJA::make_permuted_layout({{Nx, Ny, Nz}}, perm3b); + RAJA::View> perm3b_view_3D(a, perm3b_layout); iter = 0; - for (int j = 0; j < Ny; ++j) { - for (int k = 0; k < Nz; ++k) { - for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) + { + for (int k = 0; k < Nz; ++k) + { + for (int i = 0; i < Nx; ++i) + { perm3b_view_3D(i, j, k) = iter; ++iter; } @@ -351,29 +378,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _permb_view3D_end - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); 
+ // printValues(a, Nx*Ny*Nz); -// -// Clean up. -// - delete [] a; - delete [] aref; + // + // Clean up. + // + delete[] a; + delete[] aref; -//----------------------------------------------------------------------------// -// -// Layouts: multi-dimensional indices vs. linear indicies -// -// RAJA::Layout type has methods that can be used to convert between -// multi-dimensional and linear indices. We show these below using the -// three-dimensional layouts in the examples above. Recall the Nx, Ny, Nz -// sizes defined earlier: -// -// constexpr int Nx = 3; -// constexpr int Ny = 5; -// constexpr int Nz = 2; -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Layouts: multi-dimensional indices vs. linear indicies + // + // RAJA::Layout type has methods that can be used to convert between + // multi-dimensional and linear indices. We show these below using the + // three-dimensional layouts in the examples above. Recall the Nx, Ny, Nz + // sizes defined earlier: + // + // constexpr int Nx = 3; + // constexpr int Ny = 5; + // constexpr int Nz = 2; + // + //----------------------------------------------------------------------------// std::cout << "\n Multi-dimensional indices to linear indices...\n"; @@ -393,7 +420,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3a_layout.toIndices(7, i, j, k); std::cout << "\tperm3a_layout.toIndices(7, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; // _perm3d_layout_end @@ -404,7 +432,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3a_layout.toIndices(26, i, j, k); std::cout << "\tperm3a_layout.toIndices(26, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; lin = perm3a_layout(0, 2, 1); @@ -414,9 +443,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3a_layout.toIndices(21, i, j, k); std::cout << "\tperm3a_layout.toIndices(21, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\nperm3b_layout...\n" << std::endl; @@ -427,7 +457,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3b_layout.toIndices(13, i, j, k); std::cout << "\tperm3b_layout.toIndices(13, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; lin = perm3b_layout(2, 3, 1); @@ -437,7 +468,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3b_layout.toIndices(23, i, j, k); std::cout << "\tperm3b_layout.toIndices(23, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; lin = perm3b_layout(0, 2, 1); @@ -447,13 +479,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3b_layout.toIndices(15, i, j, k); std::cout << "\tperm3b_layout.toIndices(15, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << 
std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; -//----------------------------------------------------------------------------// -// -// Offset layouts apply offsets to indices -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Offset layouts apply offsets to indices + // + //----------------------------------------------------------------------------// std::cout << "\n Running offset layout cases...\n"; @@ -461,10 +494,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Define some dimensions, and allocate arrays // constexpr int Ntot_ao = 40; - int* ao = new int[ Ntot_ao ]; - int* ao_ref = new int[ Ntot_ao ]; + int* ao = new int[Ntot_ao]; + int* ao_ref = new int[Ntot_ao]; -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 1D offset layout case...\n"; @@ -478,33 +511,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) int imin = -5; int imax = 6; - for (int i = imin; i < imax; ++i) { - ao_ref[ i-imin ] = i; + for (int i = imin; i < imax; ++i) + { + ao_ref[i - imin] = i; } // _cstyle_offlayout1D_end -//printValues(ao_ref, imax-imin); + // printValues(ao_ref, imax-imin); -//----------------------------------------// + //----------------------------------------// std::memset(ao, 0, Ntot_ao * sizeof(int)); // _raja_offlayout1D_start - RAJA::OffsetLayout<1, int> offlayout_1D = - RAJA::make_offset_layout<1, int>( {{imin}}, {{imax}} ); + RAJA::OffsetLayout<1, int> offlayout_1D = + RAJA::make_offset_layout<1, int>({{imin}}, {{imax}}); - RAJA::View< int, RAJA::OffsetLayout<1, int> > aoview_1Doff(ao, - offlayout_1D); + RAJA::View> aoview_1Doff(ao, offlayout_1D); - for (int i = imin; i < imax; ++i) { + for (int i = imin; i < imax; ++i) + { aoview_1Doff(i) = i; } // _raja_offlayout1D_end - checkResult(ao, ao_ref, imax-imin); -//printValues(ao, 11); + checkResult(ao, ao_ref, imax - imin); + // printValues(ao, 11); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D offset layout case...\n"; @@ -521,39 +555,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) int jmax = 5; iter = 0; - for (int i = imin; i < imax; ++i) { - for (int j = jmin; j < jmax; ++j) { - ao_ref[ (j-jmin) + (i-imin) * (jmax-jmin) ] = iter; + for (int i = imin; i < imax; ++i) + { + for (int j = jmin; j < jmax; ++j) + { + ao_ref[(j - jmin) + (i - imin) * (jmax - jmin)] = iter; iter++; } } // _cstyle_offlayout2D_end -//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + // printValues(ao_ref, (imax-imin)*(jmax-jmin)); -//----------------------------------------// + //----------------------------------------// std::memset(ao, 0, Ntot_ao * sizeof(int)); // _raja_offlayout2D_start RAJA::OffsetLayout<2, int> offlayout_2D = - RAJA::make_offset_layout<2, int>( {{imin, jmin}}, {{imax, jmax}} ); + RAJA::make_offset_layout<2, int>({{imin, jmin}}, {{imax, jmax}}); - RAJA::View< int, RAJA::OffsetLayout<2, int> > aoview_2Doff(ao, - offlayout_2D); + RAJA::View> aoview_2Doff(ao, offlayout_2D); iter = 0; - for (int i = imin; i < imax; ++i) { - for (int j = jmin; j < jmax; ++j) { + for (int i = imin; i < imax; ++i) + { + for (int j = jmin; j < jmax; ++j) + { aoview_2Doff(i, j) = iter; iter++; } } // _raja_offlayout2D_end - checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); -//printValues(ao, 
(imax-imin)*(jmax-jmin)); + checkResult(ao, ao_ref, (imax - imin) * (jmax - jmin)); + // printValues(ao, (imax-imin)*(jmax-jmin)); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D permuted offset layout case...\n"; @@ -565,50 +602,51 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _cstyle_permofflayout2D_start iter = 0; - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { - ao_ref[ (i-imin) + (j-jmin) * (imax-imin) ] = iter; + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { + ao_ref[(i - imin) + (j - jmin) * (imax - imin)] = iter; iter++; } } // _cstyle_permofflayout2D_end -//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + // printValues(ao_ref, (imax-imin)*(jmax-jmin)); -//----------------------------------------// + //----------------------------------------// std::memset(ao, 0, Ntot_ao * sizeof(int)); // _raja_permofflayout2D_start - std::array perm1D {{1, 0}}; - RAJA::OffsetLayout<2> permofflayout_2D = - RAJA::make_permuted_offset_layout<2>( {{imin, jmin}}, - {{imax, jmax}}, - perm1D ); + std::array perm1D{{1, 0}}; + RAJA::OffsetLayout<2> permofflayout_2D = RAJA::make_permuted_offset_layout<2>( + {{imin, jmin}}, {{imax, jmax}}, perm1D); - RAJA::View< int, RAJA::OffsetLayout<2> > aoview_2Dpermoff(ao, - permofflayout_2D); + RAJA::View> aoview_2Dpermoff(ao, permofflayout_2D); iter = 0; - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { aoview_2Dpermoff(i, j) = iter; iter++; } } // _raja_permofflayout2D_end - checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); -//printValues(ao, (imax-imin)*(jmax-jmin)); + checkResult(ao, ao_ref, (imax - imin) * (jmax - jmin)); + // printValues(ao, (imax-imin)*(jmax-jmin)); -// -// Clean up. -// - delete [] ao; - delete [] ao_ref; + // + // Clean up. 
+ // + delete[] ao; + delete[] ao_ref; -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n DONE!...\n"; @@ -622,14 +660,19 @@ template void checkResult(T* C, T* Cref, int N) { bool match = true; - for (int i = 0; i < N; ++i) { - if ( std::abs( C[i] - Cref[i] ) > 10e-12 ) { + for (int i = 0; i < N; ++i) + { + if (std::abs(C[i] - Cref[i]) > 10e-12) + { match = false; } } - if ( match ) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -637,7 +680,8 @@ void checkResult(T* C, T* Cref, int N) template void printValues(T* C, int N) { - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { std::cout << "array[" << i << "] = " << C[i] << std::endl; - } + } }; diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index 59cca4bf22..062be6c1bb 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -88,7 +88,7 @@ #endif #if defined(RAJA_ENABLE_DESUL_ATOMICS) - #include "RAJA/policy/desul.hpp" +#include "RAJA/policy/desul.hpp" #endif #include "RAJA/index/IndexSet.hpp" @@ -197,11 +197,13 @@ #include "RAJA/pattern/sort.hpp" -namespace RAJA { -namespace expt{} +namespace RAJA +{ +namespace expt +{} // // provide a RAJA::expt namespace for experimental work, but bring alias // // it into RAJA so it doesn't affect user code // using namespace expt; -} +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/index/IndexSet.hpp b/include/RAJA/index/IndexSet.hpp index 1a467c8341..45f6777d93 100644 --- a/include/RAJA/index/IndexSet.hpp +++ b/include/RAJA/index/IndexSet.hpp @@ -34,8 +34,16 @@ namespace RAJA { -enum PushEnd { PUSH_FRONT, PUSH_BACK }; -enum PushCopy { PUSH_COPY, PUSH_NOCOPY }; +enum PushEnd +{ + PUSH_FRONT, + PUSH_BACK +}; +enum PushCopy +{ + PUSH_COPY, + PUSH_NOCOPY +}; template class TypedIndexSet; @@ -55,13 +63,14 @@ namespace indexset template struct ExecPolicy : public RAJA::make_policy_pattern_t { + RAJA::Pattern::forall> +{ using seg_it = SEG_ITER_POLICY_T; using seg_exec = SEG_EXEC_POLICY_T; }; -} // end namespace indexset -} // end namespace policy +} // end namespace indexset +} // end namespace policy using policy::indexset::ExecPolicy; @@ -91,7 +100,7 @@ class TypedIndexSet : public TypedIndexSet //! Construct empty index set #if _MSC_VER < 1910 - // this one instance of constexpr does not work on VS2012 or VS2015 + // this one instance of constexpr does not work on VS2012 or VS2015 RAJA_INLINE TypedIndexSet() : PARENT() {} #else RAJA_INLINE constexpr TypedIndexSet() : PARENT() {} @@ -99,12 +108,12 @@ class TypedIndexSet : public TypedIndexSet //! Copy-constructor for index set RAJA_INLINE - TypedIndexSet(TypedIndexSet const &c) - : PARENT((PARENT const &)c) + TypedIndexSet(TypedIndexSet const& c) : PARENT((PARENT const&)c) { size_t num = c.data.size(); data.resize(num); - for (size_t i = 0; i < num; ++i) { + for (size_t i = 0; i < num; ++i) + { data[i] = c.data[i]; } // mark all as not owned by us @@ -112,9 +121,10 @@ class TypedIndexSet : public TypedIndexSet } //! 
Copy-assignment operator for index set - TypedIndexSet &operator=(const TypedIndexSet &rhs) + TypedIndexSet& operator=(const TypedIndexSet& rhs) { - if (&rhs != this) { + if (&rhs != this) + { TypedIndexSet copy(rhs); this->swap(copy); } @@ -125,19 +135,21 @@ class TypedIndexSet : public TypedIndexSet RAJA_INLINE ~TypedIndexSet() { size_t num_seg = data.size(); - for (size_t i = 0; i < num_seg; ++i) { + for (size_t i = 0; i < num_seg; ++i) + { // Only free segment of we allocated it - if (owner[i]) { + if (owner[i]) + { delete data[i]; } } } //! Swap function for copy-and-swap idiom. - void swap(TypedIndexSet &other) + void swap(TypedIndexSet& other) { // Swap parents data - PARENT::swap((PARENT &)other); + PARENT::swap((PARENT&)other); // Swap our data using std::swap; swap(data, other.data); @@ -150,18 +162,20 @@ class TypedIndexSet : public TypedIndexSet /// This is used to implement the == and != operators /// template - RAJA_INLINE bool compareSegmentById( - size_t segid, - const TypedIndexSet &other) const + RAJA_INLINE bool + compareSegmentById(size_t segid, + const TypedIndexSet& other) const { // drill down our types until we have the right type - if (getSegmentTypes()[segid] != T0_TypeId) { + if (getSegmentTypes()[segid] != T0_TypeId) + { // peel off T0 return PARENT::compareSegmentById(segid, other); } // Check that other's segid is of type T0 - if (!other.template checkSegmentType(segid)) { + if (!other.template checkSegmentType(segid)) + { return false; } @@ -174,7 +188,8 @@ class TypedIndexSet : public TypedIndexSet template RAJA_INLINE bool checkSegmentType(size_t segid) const { - if (getSegmentTypes()[segid] == T0_TypeId) { + if (getSegmentTypes()[segid] == T0_TypeId) + { return std::is_same::value; } return PARENT::template checkSegmentType(segid); @@ -183,22 +198,24 @@ class TypedIndexSet : public TypedIndexSet //! get specified segment by ID template - RAJA_INLINE P0 &getSegment(size_t segid) + RAJA_INLINE P0& getSegment(size_t segid) { - if (getSegmentTypes()[segid] == T0_TypeId) { + if (getSegmentTypes()[segid] == T0_TypeId) + { Index_type offset = getSegmentOffsets()[segid]; - return *reinterpret_cast(data[offset]); + return *reinterpret_cast(data[offset]); } return PARENT::template getSegment(segid); } //! 
get specified segment by ID template - RAJA_INLINE P0 const &getSegment(size_t segid) const + RAJA_INLINE P0 const& getSegment(size_t segid) const { - if (getSegmentTypes()[segid] == T0_TypeId) { + if (getSegmentTypes()[segid] == T0_TypeId) + { Index_type offset = getSegmentOffsets()[segid]; - return *reinterpret_cast(data[offset]); + return *reinterpret_cast(data[offset]); } return PARENT::template getSegment(segid); } @@ -231,20 +248,25 @@ class TypedIndexSet : public TypedIndexSet private: template - RAJA_INLINE void push_into(TypedIndexSet &c, + RAJA_INLINE void push_into(TypedIndexSet& c, PushEnd pend = PUSH_BACK, PushCopy pcopy = PUSH_COPY) { Index_type num = getNumSegments(); - if (pend == PUSH_BACK) { - for (Index_type i = 0; i < num; ++i) { + if (pend == PUSH_BACK) + { + for (Index_type i = 0; i < num; ++i) + { segment_push_into(i, c, pend, pcopy); - } - } else { - for (Index_type i = num-1; i > -1; --i) { + } + } + else + { + for (Index_type i = num - 1; i > -1; --i) + { segment_push_into(i, c, pend, pcopy); - } + } } } @@ -257,58 +279,64 @@ class TypedIndexSet : public TypedIndexSet public: template RAJA_INLINE void segment_push_into(size_t segid, - TypedIndexSet &c, + TypedIndexSet& c, PushEnd pend = PUSH_BACK, PushCopy pcopy = PUSH_COPY) { - if (getSegmentTypes()[segid] != T0_TypeId) { + if (getSegmentTypes()[segid] != T0_TypeId) + { PARENT::segment_push_into(segid, c, pend, pcopy); return; } Index_type offset = getSegmentOffsets()[segid]; - switch (value_for(pend, pcopy)) { - case value_for(PUSH_BACK, PUSH_COPY): - c.push_back(*data[offset]); - break; - case value_for(PUSH_BACK, PUSH_NOCOPY): - c.push_back_nocopy(data[offset]); - break; - case value_for(PUSH_FRONT, PUSH_COPY): - c.push_front(*data[offset]); - break; - case value_for(PUSH_FRONT, PUSH_NOCOPY): - c.push_front_nocopy(data[offset]); - break; + switch (value_for(pend, pcopy)) + { + case value_for(PUSH_BACK, PUSH_COPY): + c.push_back(*data[offset]); + break; + case value_for(PUSH_BACK, PUSH_NOCOPY): + c.push_back_nocopy(data[offset]); + break; + case value_for(PUSH_FRONT, PUSH_COPY): + c.push_front(*data[offset]); + break; + case value_for(PUSH_FRONT, PUSH_NOCOPY): + c.push_front_nocopy(data[offset]); + break; } } //! Add segment to back end of index set without making a copy. template - RAJA_INLINE void push_back_nocopy(Tnew *val) + RAJA_INLINE void push_back_nocopy(Tnew* val) { push_internal(val, PUSH_BACK, PUSH_NOCOPY); } //! Add segment to front end of index set without making a copy. template - RAJA_INLINE void push_front_nocopy(Tnew *val) + RAJA_INLINE void push_front_nocopy(Tnew* val) { push_internal(val, PUSH_FRONT, PUSH_NOCOPY); } //! Add copy of segment to back end of index set. template - RAJA_INLINE void push_back(Tnew &&val) + RAJA_INLINE void push_back(Tnew&& val) { - push_internal(new typename std::decay::type(std::forward(val)), PUSH_BACK, PUSH_COPY); + push_internal(new typename std::decay::type(std::forward(val)), + PUSH_BACK, + PUSH_COPY); } //! Add copy of segment to front end of index set. template - RAJA_INLINE void push_front(Tnew &&val) + RAJA_INLINE void push_front(Tnew&& val) { - push_internal(new typename std::decay::type(std::forward(val)), PUSH_FRONT, PUSH_COPY); + push_internal(new typename std::decay::type(std::forward(val)), + PUSH_FRONT, + PUSH_COPY); } //! 
Return total length -- sum of lengths of all segments @@ -316,7 +344,8 @@ class TypedIndexSet : public TypedIndexSet { size_t total = PARENT::getLength(); size_t num = data.size(); - for (size_t i = 0; i < num; ++i) { + for (size_t i = 0; i < num; ++i) + { total += data[i]->size(); } return total; @@ -339,14 +368,13 @@ class TypedIndexSet : public TypedIndexSet /// RAJA_SUPPRESS_HD_WARN template - RAJA_HOST_DEVICE void segmentCall(size_t segid, - BODY &&body, - ARGS &&... args) const - { - if (getSegmentTypes()[segid] != T0_TypeId) { - PARENT::segmentCall(segid, - std::forward(body), - std::forward(args)...); + RAJA_HOST_DEVICE void + segmentCall(size_t segid, BODY&& body, ARGS&&... args) const + { + if (getSegmentTypes()[segid] != T0_TypeId) + { + PARENT::segmentCall( + segid, std::forward(body), std::forward(args)...); return; } Index_type offset = getSegmentOffsets()[segid]; @@ -356,24 +384,23 @@ class TypedIndexSet : public TypedIndexSet protected: //! Internal logic to add a new segment -- catch invalid type insertion template - RAJA_INLINE void push_internal(Tnew *val, - PushEnd pend = PUSH_BACK, - PushCopy pcopy = PUSH_COPY) + RAJA_INLINE void + push_internal(Tnew* val, PushEnd pend = PUSH_BACK, PushCopy pcopy = PUSH_COPY) { static_assert(sizeof...(TREST) > 0, "Invalid type for this TypedIndexSet"); PARENT::push_internal(val, pend, pcopy); } //! Internal logic to add a new segment - RAJA_INLINE void push_internal(T0 *val, - PushEnd pend = PUSH_BACK, - PushCopy pcopy = PUSH_COPY) + RAJA_INLINE void + push_internal(T0* val, PushEnd pend = PUSH_BACK, PushCopy pcopy = PUSH_COPY) { data.push_back(val); owner.push_back(pcopy == PUSH_COPY); // Determine if we push at the front or back of the segment list - if (pend == PUSH_BACK) { + if (pend == PUSH_BACK) + { // Store the segment type getSegmentTypes().push_back(T0_TypeId); @@ -384,7 +411,9 @@ class TypedIndexSet : public TypedIndexSet size_t icount = val->size(); getSegmentIcounts().push_back(getTotalLength()); increaseTotalLength(icount); - } else { + } + else + { // Store the segment type getSegmentTypes().push_front(T0_TypeId); @@ -394,7 +423,8 @@ class TypedIndexSet : public TypedIndexSet // Store the segment icount getSegmentIcounts().push_front(0); size_t icount = val->size(); - for (size_t i = 1; i < getSegmentIcounts().size(); ++i) { + for (size_t i = 1; i < getSegmentIcounts().size(); ++i) + { getSegmentIcounts()[i] += icount; } increaseTotalLength(icount); @@ -402,7 +432,7 @@ class TypedIndexSet : public TypedIndexSet } //! Returns the number of indices (the total icount of segments - RAJA_INLINE Index_type &getTotalLength() { return PARENT::getTotalLength(); } + RAJA_INLINE Index_type& getTotalLength() { return PARENT::getTotalLength(); } //! set total length of the indexset RAJA_INLINE void setTotalLength(int n) { return PARENT::setTotalLength(n); } @@ -439,7 +469,8 @@ class TypedIndexSet : public TypedIndexSet int minSeg = RAJA::operators::maximum{}(0, begin); int maxSeg = RAJA::operators::minimum{}(end, getNumSegments()); - for (int i = minSeg; i < maxSeg; ++i) { + for (int i = minSeg; i < maxSeg; ++i) + { segment_push_into(i, retVal, PUSH_BACK, PUSH_NOCOPY); } return retVal; @@ -452,13 +483,15 @@ class TypedIndexSet : public TypedIndexSet /// This TypedIndexSet will not change and the created "slice" into it /// will not own any of its segments. 
/// - TypedIndexSet createSlice(const int *segIds, int len) + TypedIndexSet createSlice(const int* segIds, int len) { TypedIndexSet retVal; int numSeg = getNumSegments(); - for (int i = 0; i < len; ++i) { - if (segIds[i] >= 0 && segIds[i] < numSeg) { + for (int i = 0; i < len; ++i) + { + if (segIds[i] >= 0 && segIds[i] < numSeg) + { segment_push_into(segIds[i], retVal, PUSH_BACK, PUSH_NOCOPY); } } @@ -476,12 +509,14 @@ class TypedIndexSet : public TypedIndexSet /// iterator type must de-reference to an integral value. /// template - TypedIndexSet createSlice(const T &segIds) + TypedIndexSet createSlice(const T& segIds) { TypedIndexSet retVal; int numSeg = getNumSegments(); - for (auto &seg : segIds) { - if (seg >= 0 && seg < numSeg) { + for (auto& seg : segIds) + { + if (seg >= 0 && seg < numSeg) + { segment_push_into(seg, retVal, PUSH_BACK, PUSH_NOCOPY); } } @@ -509,37 +544,37 @@ class TypedIndexSet : public TypedIndexSet protected: //! Returns the mapping of segment_index -> segment_type - RAJA_INLINE RAJA::RAJAVec &getSegmentTypes() + RAJA_INLINE RAJA::RAJAVec& getSegmentTypes() { return PARENT::getSegmentTypes(); } //! Returns the mapping of segment_index -> segment_type - RAJA_INLINE RAJA::RAJAVec const &getSegmentTypes() const + RAJA_INLINE RAJA::RAJAVec const& getSegmentTypes() const { return PARENT::getSegmentTypes(); } //! Returns the mapping of segment_index -> segment_offset - RAJA_INLINE RAJA::RAJAVec &getSegmentOffsets() + RAJA_INLINE RAJA::RAJAVec& getSegmentOffsets() { return PARENT::getSegmentOffsets(); } //! Returns the mapping of segment_index -> segment_offset - RAJA_INLINE RAJA::RAJAVec const &getSegmentOffsets() const + RAJA_INLINE RAJA::RAJAVec const& getSegmentOffsets() const { return PARENT::getSegmentOffsets(); } //! Returns the icount of segments - RAJA_INLINE RAJA::RAJAVec &getSegmentIcounts() + RAJA_INLINE RAJA::RAJAVec& getSegmentIcounts() { return PARENT::getSegmentIcounts(); } //! Returns the icount of segments - RAJA_INLINE RAJA::RAJAVec const &getSegmentIcounts() const + RAJA_INLINE RAJA::RAJAVec const& getSegmentIcounts() const { return PARENT::getSegmentIcounts(); } @@ -552,13 +587,15 @@ class TypedIndexSet : public TypedIndexSet /// types and indices; e.g., dependency info not checked. /// template - RAJA_INLINE bool operator==(const TypedIndexSet &other) const + RAJA_INLINE bool operator==(const TypedIndexSet& other) const { size_t num_seg = getNumSegments(); if (num_seg != other.getNumSegments()) return false; - for (size_t segid = 0; segid < num_seg; ++segid) { - if (!compareSegmentById(segid, other)) { + for (size_t segid = 0; segid < num_seg; ++segid) + { + if (!compareSegmentById(segid, other)) + { return false; } } @@ -567,14 +604,14 @@ class TypedIndexSet : public TypedIndexSet //! Inequality operator returns true if any segment is not equal, else false. template - RAJA_INLINE bool operator!=(const TypedIndexSet &other) const + RAJA_INLINE bool operator!=(const TypedIndexSet& other) const { return (!(*this == other)); } private: //! vector of TypedIndexSet data objects of type T0 - RAJA::RAJAVec data; + RAJA::RAJAVec data; //! vector indicating which segments are owned by the TypedIndexSet RAJA::RAJAVec owner; @@ -603,7 +640,7 @@ class TypedIndexSet<> //! Copy-constructor. RAJA_INLINE - TypedIndexSet(TypedIndexSet const &c) + TypedIndexSet(TypedIndexSet const& c) { segment_types = c.segment_types; segment_offsets = c.segment_offsets; @@ -612,7 +649,7 @@ class TypedIndexSet<> } //! Swap function for copy-and-swap idiom (deep copy). 
- void swap(TypedIndexSet &other) + void swap(TypedIndexSet& other) { using std::swap; swap(segment_types, other.segment_types); @@ -625,7 +662,7 @@ class TypedIndexSet<> RAJA_INLINE static size_t getNumTypes() { return 0; } template - RAJA_INLINE constexpr bool isValidSegmentType(T const &) const + RAJA_INLINE constexpr bool isValidSegmentType(T const&) const { // Segment type wasn't found return false; @@ -637,40 +674,39 @@ class TypedIndexSet<> template RAJA_INLINE void segmentCall(size_t, BODY, ARGS...) const - { - } + {} - RAJA_INLINE RAJA::RAJAVec &getSegmentTypes() + RAJA_INLINE RAJA::RAJAVec& getSegmentTypes() { return segment_types; } - RAJA_INLINE RAJA::RAJAVec const &getSegmentTypes() const + RAJA_INLINE RAJA::RAJAVec const& getSegmentTypes() const { return segment_types; } - RAJA_INLINE RAJA::RAJAVec &getSegmentOffsets() + RAJA_INLINE RAJA::RAJAVec& getSegmentOffsets() { return segment_offsets; } - RAJA_INLINE RAJA::RAJAVec const &getSegmentOffsets() const + RAJA_INLINE RAJA::RAJAVec const& getSegmentOffsets() const { return segment_offsets; } - RAJA_INLINE RAJA::RAJAVec &getSegmentIcounts() + RAJA_INLINE RAJA::RAJAVec& getSegmentIcounts() { return segment_icounts; } - RAJA_INLINE RAJA::RAJAVec const &getSegmentIcounts() const + RAJA_INLINE RAJA::RAJAVec const& getSegmentIcounts() const { return segment_icounts; } - RAJA_INLINE Index_type &getTotalLength() { return m_len; } + RAJA_INLINE Index_type& getTotalLength() { return m_len; } RAJA_INLINE void setTotalLength(int n) { m_len = n; } @@ -678,7 +714,7 @@ class TypedIndexSet<> template RAJA_INLINE bool compareSegmentById(size_t, - const TypedIndexSet &) const + const TypedIndexSet&) const { return false; } @@ -690,34 +726,29 @@ class TypedIndexSet<> } template - RAJA_INLINE P0 &getSegment(size_t) + RAJA_INLINE P0& getSegment(size_t) { - return *((P0 *)(this - this)); + return *((P0*)(this - this)); } template - RAJA_INLINE P0 const &getSegment(size_t) const + RAJA_INLINE P0 const& getSegment(size_t) const { - return *((P0 *)(this - this)); + return *((P0*)(this - this)); } template - RAJA_INLINE void push_into(TypedIndexSet &, PushEnd, PushCopy) const - { - } + RAJA_INLINE void push_into(TypedIndexSet&, PushEnd, PushCopy) const + {} template - RAJA_INLINE void segment_push_into(size_t, - TypedIndexSet &, - PushEnd, - PushCopy) const - { - } + RAJA_INLINE void + segment_push_into(size_t, TypedIndexSet&, PushEnd, PushCopy) const + {} template - RAJA_INLINE void push(Tnew const &, PushEnd, PushCopy) - { - } + RAJA_INLINE void push(Tnew const&, PushEnd, PushCopy) + {} public: using iterator = Iterators::numeric_iterator; @@ -762,15 +793,17 @@ namespace type_traits template struct is_index_set - : ::RAJA::type_traits::SpecializationOf::type> { -}; + : ::RAJA::type_traits::SpecializationOf::type> +{}; template struct is_indexset_policy - : ::RAJA::type_traits::SpecializationOf::type> { -}; -} // namespace type_traits + : ::RAJA::type_traits::SpecializationOf::type> +{}; +} // namespace type_traits -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/index/IndexSetBuilders.hpp b/include/RAJA/index/IndexSetBuilders.hpp index 543524be01..cd614cca01 100644 --- a/include/RAJA/index/IndexSetBuilders.hpp +++ b/include/RAJA/index/IndexSetBuilders.hpp @@ -37,13 +37,13 @@ namespace RAJA * \brief Generate an index set with aligned Range segments and List segments, * as needed, from given array of indices. 
* - * Routine does no error-checking on argements and assumes + * Routine does no error-checking on argements and assumes * RAJA::Index_type array contains valid indices. * - * \param iset reference to index set generated with aligned range segments + * \param iset reference to index set generated with aligned range segments * and list segments. Method assumes index set is empty (no segments). - * \param work_res camp resource object that identifies the memory space in - * which list segment index data will live (passed to list segment + * \param work_res camp resource object that identifies the memory space in + * which list segment index data will live (passed to list segment * ctor). * \param indices_in pointer to start of input array of indices. * \param length size of input index array. @@ -79,37 +79,36 @@ void RAJASHAREDDLL_API buildIndexSetAligned( ****************************************************************************** * * \brief Generate a lock-free "block" index set (planar division) containing - * range segments. + * range segments. * - * The method chunks a fastDim x midDim x slowDim mesh into blocks that + * The method chunks a fastDim x midDim x slowDim mesh into blocks that * can be dependency-scheduled, removing need for lock constructs. * * \param iset reference to index set generated with range segments. - * Method assumes index set is empty (no segments). + * Method assumes index set is empty (no segments). * \param fastDim "fast" block dimension (see above). * \param midDim "mid" block dimension (see above). * \param slowDim "slow" block dimension (see above). * ****************************************************************************** */ -void buildLockFreeBlockIndexset( - RAJA::TypedIndexSet& iset, - int fastDim, - int midDim, - int slowDim); +void buildLockFreeBlockIndexset(RAJA::TypedIndexSet& iset, + int fastDim, + int midDim, + int slowDim); /*! ****************************************************************************** * * \brief Generate a lock-free "color" index set containing range and list * segments. - * - * TThe domain-set is colored based on connectivity to the range-set. - * All elements in each segment are independent, and no two segments + * + * TThe domain-set is colored based on connectivity to the range-set. + * All elements in each segment are independent, and no two segments * can be executed in parallel. * - * \param iset reference to index set generated. Method assumes index set - * is empty (no segments). + * \param iset reference to index set generated. Method assumes index set + * is empty (no segments). * \param work_res camp resource object that identifies the memory space in * which list segment index data will live (passed to list segment * ctor). @@ -126,6 +125,6 @@ void buildLockFreeColorIndexset( RAJA::Index_type* elemPermutation = nullptr, RAJA::Index_type* ielemPermutation = nullptr); -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/index/IndexSetUtils.hpp b/include/RAJA/index/IndexSetUtils.hpp index 4baea450fc..eefc0ebbc4 100644 --- a/include/RAJA/index/IndexSetUtils.hpp +++ b/include/RAJA/index/IndexSetUtils.hpp @@ -31,10 +31,10 @@ namespace RAJA //@{ //! @name Methods to gather indices of segment or index set into a container. //! -//! For each method, the given container must be templated on a data type, -//! have default and copy ctors, push_back method, and value_type. Is is -//! 
assumed that the container data type and segment or index set data type -//! are compatible in the sense that the index set type can be converted to +//! For each method, the given container must be templated on a data type, +//! have default and copy ctors, push_back method, and value_type. Is is +//! assumed that the container data type and segment or index set data type +//! are compatible in the sense that the index set type can be converted to //! the container data type. /*! @@ -49,11 +49,8 @@ RAJA_INLINE void getIndices(CONTAINER_T& con, const TypedIndexSet& iset) { CONTAINER_T tcon; - forall >(iset, - [&](typename CONTAINER_T::value_type idx) { - tcon.push_back(idx); - } - ); + forall>( + iset, [&](typename CONTAINER_T::value_type idx) { tcon.push_back(idx); }); con = tcon; } @@ -68,11 +65,8 @@ template RAJA_INLINE void getIndices(CONTAINER_T& con, const SEGMENT_T& seg) { CONTAINER_T tcon; - forall(seg, - [&](typename CONTAINER_T::value_type idx) { - tcon.push_back(idx); - } - ); + forall( + seg, [&](typename CONTAINER_T::value_type idx) { tcon.push_back(idx); }); con = tcon; } @@ -90,11 +84,10 @@ RAJA_INLINE void getIndicesConditional(CONTAINER_T& con, CONDITIONAL conditional) { CONTAINER_T tcon; - forall >(iset, - [&](typename CONTAINER_T::value_type idx) { - if (conditional(idx)) tcon.push_back(idx); - } - ); + forall>( + iset, [&](typename CONTAINER_T::value_type idx) { + if (conditional(idx)) tcon.push_back(idx); + }); con = tcon; } @@ -112,16 +105,14 @@ RAJA_INLINE void getIndicesConditional(CONTAINER_T& con, CONDITIONAL conditional) { CONTAINER_T tcon; - forall(seg, - [&](typename CONTAINER_T::value_type idx) { - if (conditional(idx)) tcon.push_back(idx); - } - ); + forall(seg, [&](typename CONTAINER_T::value_type idx) { + if (conditional(idx)) tcon.push_back(idx); + }); con = tcon; } //@} -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/index/IndexValue.hpp b/include/RAJA/index/IndexValue.hpp index 44fa143445..8579f2c856 100644 --- a/include/RAJA/index/IndexValue.hpp +++ b/include/RAJA/index/IndexValue.hpp @@ -28,8 +28,8 @@ namespace RAJA { -struct IndexValueBase { -}; +struct IndexValueBase +{}; /*! * \brief Strongly typed "integer" class. @@ -44,16 +44,17 @@ struct IndexValueBase { * Yes, this uses the curiously-recurring template pattern. */ template -struct IndexValue : public IndexValueBase { +struct IndexValue : public IndexValueBase +{ using value_type = VALUE; //! Default constructor initializes value to 0. RAJA_INLINE constexpr IndexValue() = default; - constexpr RAJA_INLINE IndexValue(IndexValue const &) = default; - constexpr RAJA_INLINE IndexValue(IndexValue &&) = default; - RAJA_INLINE IndexValue &operator=(IndexValue const &) = default; - RAJA_INLINE IndexValue &operator=(IndexValue &&) = default; + constexpr RAJA_INLINE IndexValue(IndexValue const&) = default; + constexpr RAJA_INLINE IndexValue(IndexValue&&) = default; + RAJA_INLINE IndexValue& operator=(IndexValue const&) = default; + RAJA_INLINE IndexValue& operator=(IndexValue&&) = default; /*! * \brief Explicit constructor. @@ -61,14 +62,13 @@ struct IndexValue : public IndexValueBase { */ RAJA_HOST_DEVICE RAJA_INLINE constexpr explicit IndexValue(value_type v) : value(v) - { - } + {} //! Dereference provides cast-to-integer. - RAJA_HOST_DEVICE RAJA_INLINE value_type &operator*() { return value; } + RAJA_HOST_DEVICE RAJA_INLINE value_type& operator*() { return value; } //! 
Dereference provides cast-to-integer. - RAJA_HOST_DEVICE RAJA_INLINE const value_type &operator*() const + RAJA_HOST_DEVICE RAJA_INLINE const value_type& operator*() const { return value; } @@ -82,10 +82,10 @@ struct IndexValue : public IndexValueBase { } //! preincrement stored index - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator++() + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator++() { value++; - return static_cast(*this); + return static_cast(*this); } //! postdecrement -- returns a copy @@ -97,10 +97,10 @@ struct IndexValue : public IndexValueBase { } //! preincrement stored index - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator--() + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator--() { value--; - return static_cast(*this); + return static_cast(*this); } //! addition to underlying index from an Index_type @@ -163,52 +163,52 @@ struct IndexValue : public IndexValueBase { return TYPE(value % a.value); } - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator+=(value_type x) + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator+=(value_type x) { value += x; - return static_cast(*this); + return static_cast(*this); } - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator+=(TYPE x) + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator+=(TYPE x) { value += x.value; - return static_cast(*this); + return static_cast(*this); } - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator-=(value_type x) + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator-=(value_type x) { value -= x; - return static_cast(*this); + return static_cast(*this); } - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator-=(TYPE x) + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator-=(TYPE x) { value -= x.value; - return static_cast(*this); + return static_cast(*this); } - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator*=(value_type x) + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator*=(value_type x) { value *= x; - return static_cast(*this); + return static_cast(*this); } - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator*=(TYPE x) + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator*=(TYPE x) { value *= x.value; - return static_cast(*this); + return static_cast(*this); } - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator/=(value_type x) + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator/=(value_type x) { value /= x; - return static_cast(*this); + return static_cast(*this); } - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator/=(TYPE x) + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator/=(TYPE x) { value /= x.value; - return static_cast(*this); + return static_cast(*this); } RAJA_HOST_DEVICE RAJA_INLINE bool operator<(value_type x) const @@ -295,7 +295,7 @@ convertIndex_helper(typename FROM::IndexValueType const val) } -} // namespace internal +} // namespace internal /*! * \brief Function provides a way to take either an int or any Index<> type, and @@ -334,16 +334,20 @@ constexpr RAJA_HOST_DEVICE RAJA_INLINE return val; } -namespace internal{ -template -struct StripIndexTypeT { - using type = FROM; +namespace internal +{ +template +struct StripIndexTypeT +{ + using type = FROM; }; -template -struct StripIndexTypeT::value>::type> +template +struct StripIndexTypeT< + FROM, + typename std::enable_if::value>::type> { - using type = typename FROM::value_type; + using type = typename FROM::value_type; }; } // namespace internal @@ -353,7 +357,7 @@ struct StripIndexTypeT +template using strip_index_type_t = typename internal::StripIndexTypeT::type; /*! 
@@ -362,33 +366,31 @@ using strip_index_type_t = typename internal::StripIndexTypeT::type; * * \param FROM the original type */ -template -using make_signed_t = typename std::conditional < - std::is_floating_point::value, - std::common_type, - std::make_signed - >::type::type; +template +using make_signed_t = + typename std::conditional::value, + std::common_type, + std::make_signed>::type::type; -} // namespace RAJA +} // namespace RAJA /*! * \brief Helper Macro to create new Index types. * \param TYPE the name of the type * \param NAME a string literal to identify this index type */ -#define RAJA_INDEX_VALUE(TYPE, NAME) \ - class TYPE : public ::RAJA::IndexValue \ - { \ - using parent = ::RAJA::IndexValue; \ - \ - public: \ - using IndexValueType = TYPE; \ - RAJA_HOST_DEVICE RAJA_INLINE TYPE() : parent::IndexValue() {} \ - RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(::RAJA::Index_type v) \ - : parent::IndexValue(v) \ - { \ - } \ - static inline std::string getName() { return NAME; } \ +#define RAJA_INDEX_VALUE(TYPE, NAME) \ + class TYPE : public ::RAJA::IndexValue \ + { \ + using parent = ::RAJA::IndexValue; \ + \ + public: \ + using IndexValueType = TYPE; \ + RAJA_HOST_DEVICE RAJA_INLINE TYPE() : parent::IndexValue() {} \ + RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(::RAJA::Index_type v) \ + : parent::IndexValue(v) \ + {} \ + static inline std::string getName() { return NAME; } \ }; /*! @@ -397,17 +399,17 @@ using make_signed_t = typename std::conditional < * \param IDXT the index types value type * \param NAME a string literal to identify this index type */ -#define RAJA_INDEX_VALUE_T(TYPE, IDXT, NAME) \ - class TYPE : public ::RAJA::IndexValue \ - { \ - public: \ - RAJA_HOST_DEVICE RAJA_INLINE TYPE() \ - : RAJA::IndexValue::IndexValue() {} \ - RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(IDXT v) \ - : RAJA::IndexValue::IndexValue(v) \ - { \ - } \ - static inline std::string getName() { return NAME; } \ +#define RAJA_INDEX_VALUE_T(TYPE, IDXT, NAME) \ + class TYPE : public ::RAJA::IndexValue \ + { \ + public: \ + RAJA_HOST_DEVICE RAJA_INLINE TYPE() \ + : RAJA::IndexValue::IndexValue() \ + {} \ + RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(IDXT v) \ + : RAJA::IndexValue::IndexValue(v) \ + {} \ + static inline std::string getName() { return NAME; } \ }; #endif diff --git a/include/RAJA/index/ListSegment.hpp b/include/RAJA/index/ListSegment.hpp index adee46053c..ec4da54a1d 100644 --- a/include/RAJA/index/ListSegment.hpp +++ b/include/RAJA/index/ListSegment.hpp @@ -85,7 +85,6 @@ template class TypedListSegment { public: - //@{ //! @name Types used in implementation based on template parameter. @@ -111,7 +110,7 @@ class TypedListSegment * \param values array of indices defining iteration space of segment * \param length number of indices * \param resource camp resource defining memory space where index data live - * \param owned optional enum value indicating whether segment owns indices + * \param owned optional enum value indicating whether segment owns indices * (Owned or Unowned). Default is Owned. 
* * If 'Unowned' is passed as last argument, the segment will not own its @@ -121,7 +120,7 @@ class TypedListSegment Index_type length, camp::resources::Resource resource, IndexOwnership owned = Owned) - : m_resource(nullptr), m_owned(Unowned), m_data(nullptr), m_size(0) + : m_resource(nullptr), m_owned(Unowned), m_data(nullptr), m_size(0) { initIndexData(values, length, resource, owned); } @@ -141,9 +140,13 @@ class TypedListSegment template TypedListSegment(const Container& container, camp::resources::Resource resource) - : m_resource(nullptr), m_owned(Unowned), m_data(nullptr), m_size(container.size()) + : m_resource(nullptr), + m_owned(Unowned), + m_data(nullptr), + m_size(container.size()) { - if (m_size > 0) { + if (m_size > 0) + { camp::resources::Resource host_res{camp::resources::Host()}; @@ -152,7 +155,8 @@ class TypedListSegment auto dest = tmp; auto src = container.begin(); auto const end = container.end(); - while (src != end) { + while (src != end) + { *dest = *src; ++dest; ++src; @@ -164,7 +168,6 @@ class TypedListSegment m_owned = Owned; host_res.deallocate(tmp); - } } @@ -175,10 +178,11 @@ class TypedListSegment // As this may be called from a lambda in a // RAJA method we perform a shallow copy RAJA_HOST_DEVICE TypedListSegment(const TypedListSegment& other) - : m_resource(nullptr), - m_owned(Unowned), m_data(other.m_data), m_size(other.m_size) - { - } + : m_resource(nullptr), + m_owned(Unowned), + m_data(other.m_data), + m_size(other.m_size) + {} //! Copy assignment for list segment // As this may be called from a lambda in a @@ -192,7 +196,7 @@ class TypedListSegment m_size = other.m_size; } - //! move assignment for list segment + //! move assignment for list segment // As this may be called from a lambda in a // RAJA method we perform a shallow copy RAJA_HOST_DEVICE TypedListSegment& operator=(TypedListSegment&& rhs) @@ -211,8 +215,10 @@ class TypedListSegment //! Move constructor for list segment RAJA_HOST_DEVICE TypedListSegment(TypedListSegment&& rhs) - : m_resource(rhs.m_resource), - m_owned(rhs.m_owned), m_data(rhs.m_data), m_size(rhs.m_size) + : m_resource(rhs.m_resource), + m_owned(rhs.m_owned), + m_data(rhs.m_data), + m_size(rhs.m_size) { rhs.m_owned = Unowned; rhs.m_resource = nullptr; @@ -221,17 +227,15 @@ class TypedListSegment } //! List segment destructor - RAJA_HOST_DEVICE ~TypedListSegment() - { - clear(); - } + RAJA_HOST_DEVICE ~TypedListSegment() { clear(); } //! 
Clear method to be called RAJA_HOST_DEVICE void clear() { #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) - if (m_data != nullptr && m_owned == Owned) { + if (m_data != nullptr && m_owned == Owned) + { m_resource->deallocate(m_data); delete m_resource; } @@ -345,7 +349,8 @@ class TypedListSegment { // empty list segment - if (len <= 0 || container == nullptr) { + if (len <= 0 || container == nullptr) + { m_data = nullptr; m_size = 0; m_owned = Unowned; @@ -355,22 +360,24 @@ class TypedListSegment // some non-zero size -- initialize accordingly m_size = len; m_owned = container_own; - if (m_owned == Owned) { + if (m_owned == Owned) + { - m_resource = new camp::resources::Resource(resource_); + m_resource = new camp::resources::Resource(resource_); - camp::resources::Resource host_res{camp::resources::Host()}; + camp::resources::Resource host_res{camp::resources::Host()}; - value_type* tmp = host_res.allocate(m_size); + value_type* tmp = host_res.allocate(m_size); - for (Index_type i = 0; i < m_size; ++i) { - tmp[i] = container[i]; - } + for (Index_type i = 0; i < m_size; ++i) + { + tmp[i] = container[i]; + } - m_data = m_resource->allocate(m_size); - m_resource->memcpy(m_data, tmp, sizeof(value_type) * m_size); + m_data = m_resource->allocate(m_size); + m_resource->memcpy(m_data, tmp, sizeof(value_type) * m_size); - host_res.deallocate(tmp); + host_res.deallocate(tmp); return; } @@ -382,7 +389,7 @@ class TypedListSegment // Copy of camp resource passed to ctor - camp::resources::Resource *m_resource; + camp::resources::Resource* m_resource; // Ownership flag to guide data copying/management IndexOwnership m_owned; @@ -397,7 +404,7 @@ class TypedListSegment //! Alias for A TypedListSegment using ListSegment = TypedListSegment; -} // namespace RAJA +} // namespace RAJA namespace std { @@ -409,6 +416,6 @@ RAJA_INLINE void swap(RAJA::TypedListSegment& a, { a.swap(b); } -} // namespace std +} // namespace std -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/index/RangeSegment.hpp b/include/RAJA/index/RangeSegment.hpp index a41959c583..3ee1ba3653 100644 --- a/include/RAJA/index/RangeSegment.hpp +++ b/include/RAJA/index/RangeSegment.hpp @@ -50,10 +50,10 @@ namespace RAJA * * NOTE: TypedRangeSegment::iterator is a RandomAccessIterator * - * NOTE: TypedRangeSegment supports negative indices; e.g., an interval of + * NOTE: TypedRangeSegment supports negative indices; e.g., an interval of * indices [-5, 3). * - * NOTE: Proper handling of indices strides requires that StorageT is a + * NOTE: Proper handling of indices strides requires that StorageT is a * signed type. * * Usage: @@ -92,15 +92,22 @@ namespace RAJA * ****************************************************************************** */ -template >> -struct TypedRangeSegment { +template >> +struct TypedRangeSegment +{ - // + // // Static asserts to provide some useful error messages during compilation // for incorrect usage. - // - static_assert(std::is_signed::value, "TypedRangeSegment DiffT requires signed type."); - static_assert(!std::is_floating_point::value, "TypedRangeSegment Type must be non floating point."); + // + static_assert(std::is_signed::value, + "TypedRangeSegment DiffT " + "requires signed type."); + static_assert(!std::is_floating_point::value, + "TypedRangeSegment " + "Type must be non " + "floating point."); //@{ //! @name Types used in implementation based on template parameters. 
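
For readers checking the reformatted TypedRangeSegment hunks against the documented behavior, a minimal usage sketch may help (the seq_exec policy, the loop body, and the values are illustrative only and are not part of this patch):

    #include "RAJA/RAJA.hpp"

    void range_segment_sketch()
    {
      // Half-open interval [0, 10): inclusive of begin(), exclusive of end()
      RAJA::TypedRangeSegment<int> range(0, 10);

      int sum = 0;
      RAJA::forall<RAJA::seq_exec>(range, [&](int i) { sum += i; });  // sum == 45

      // slice(1, 5) keeps at most 5 iterates starting at the 2nd entry: [1, 6)
      auto sub = range.slice(1, 5);
      (void)sub;
    }
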
@@ -117,20 +124,19 @@ struct TypedRangeSegment { //@} //@{ - //! @name Constructors, destructor, and copy assignment. + //! @name Constructors, destructor, and copy assignment. /*! * \brief Construct a range segment repreenting the interval [begin, end) - * + * * \param begin start value (inclusive) for the range * \param end end value (exclusive) for the range */ using StripStorageT = strip_index_type_t; - RAJA_HOST_DEVICE constexpr TypedRangeSegment(StripStorageT begin, StripStorageT end) - : m_begin(iterator(begin)), - m_end(begin > end ? m_begin : iterator(end)) - { - } + RAJA_HOST_DEVICE constexpr TypedRangeSegment(StripStorageT begin, + StripStorageT end) + : m_begin(iterator(begin)), m_end(begin > end ? m_begin : iterator(end)) + {} //! Disable compiler generated constructor RAJA_HOST_DEVICE TypedRangeSegment() = delete; @@ -187,7 +193,7 @@ struct TypedRangeSegment { * \brief Compare this segment to another for inequality * * \return true if begin or end does not match, else false - */ + */ RAJA_HOST_DEVICE RAJA_INLINE bool operator!=(TypedRangeSegment const& o) const { return !(operator==(o)); @@ -198,9 +204,9 @@ struct TypedRangeSegment { /*! * \brief Get a new TypedRangeSegment instance representing a slice of * existing segment - * - * \param begin start iterate of new range - * \param length maximum length of new range + * + * \param begin start iterate of new range + * \param length maximum length of new range * \return TypedRangeSegment representing the interval * [ *begin() + begin, min( *begin() + begin + length, *end() ) ) * @@ -213,7 +219,7 @@ struct TypedRangeSegment { * auto r = RAJA::TypedRangeSegment(-4, 4); * * // s repreents the subinterval [-3, 2) - * auto s = r.slice(1, 5); + * auto s = r.slice(1, 5); * * \endverbatim */ @@ -247,8 +253,8 @@ struct TypedRangeSegment { /*! ****************************************************************************** * - * \class TypedRangeStrideSegment - * + * \class TypedRangeStrideSegment + * * \brief Segment class representing a strided range of typed indices * * \tparam StorageT underlying data type for the segment indices (required) @@ -264,9 +270,9 @@ struct TypedRangeSegment { * * NOTE: TypedRangeStrideSegment::iterator is a RandomAccessIterator * - * NOTE: TypedRangeStrideSegment allows for positive or negative strides and - * indices. This allows for forward (stride > 0) or backward (stride < 0) - * traversal of the iteration space. A stride of zero is undefined and + * NOTE: TypedRangeStrideSegment allows for positive or negative strides and + * indices. This allows for forward (stride > 0) or backward (stride < 0) + * traversal of the iteration space. A stride of zero is undefined and * will cause divide-by-zero errors. * * As with RangeSegment, the iteration space is inclusive of begin() and @@ -275,7 +281,7 @@ struct TypedRangeSegment { * For positive strides, begin() > end() implies size()==0 * For negative strides, begin() < end() implies size()==0 * - * NOTE: Proper handling of negative strides and indices requires that + * NOTE: Proper handling of negative strides and indices requires that * StorageT is a signed type. * * Usage: @@ -321,15 +327,23 @@ struct TypedRangeSegment { * ****************************************************************************** */ -template >> -struct TypedRangeStrideSegment { +template >> +struct TypedRangeStrideSegment +{ // // Static asserts to provide some useful error messages during compilation // for incorrect usage. 
// - static_assert(std::is_signed::value, "TypedRangeStrideSegment DiffT requires signed type."); - static_assert(!std::is_floating_point::value, "TypedRangeStrideSegment Type must be non floating point."); + static_assert(std::is_signed::value, + "TypedRangeStrideSegment DiffT " + "requires signed type."); + static_assert(!std::is_floating_point::value, + "TypedRangeStrideSegm" + "ent Type must be " + "non floating " + "point."); //@{ //! @name Types used in implementation based on template parameters. @@ -349,7 +363,7 @@ struct TypedRangeStrideSegment { //! @name Constructors, destructor, and copy assignment. /*! - * \brief Construct a range segment for the interval [begin, end) with + * \brief Construct a range segment for the interval [begin, end) with * given stride * * \param begin start value (inclusive) for the range @@ -357,9 +371,8 @@ struct TypedRangeStrideSegment { * \param stride stride value when iterating over the range */ using StripStorageT = strip_index_type_t; - RAJA_HOST_DEVICE TypedRangeStrideSegment(StripStorageT begin, - StripStorageT end, - DiffT stride) + RAJA_HOST_DEVICE + TypedRangeStrideSegment(StripStorageT begin, StripStorageT end, DiffT stride) : m_begin(iterator(begin, stride)), m_end(iterator(end, stride)), // essentially a ceil((end-begin)/stride) but using integer math, @@ -367,9 +380,12 @@ struct TypedRangeStrideSegment { m_size((end - begin + stride - (stride > 0 ? 1 : -1)) / stride) { // clamp range when end is unreachable from begin without wrapping - if (stride < 0 && end > begin) { + if (stride < 0 && end > begin) + { m_end = m_begin; - } else if (stride > 0 && end < begin) { + } + else if (stride > 0 && end < begin) + { m_end = m_begin; } // m_size initialized as negative indicates a zero iteration space @@ -408,8 +424,8 @@ struct TypedRangeStrideSegment { /*! * \brief Get size of this segment - * - * The size is the number of iterates in the + * + * The size is the number of iterates in the * interval [begin, end) when striding over it */ RAJA_HOST_DEVICE DiffT size() const { return m_size; } @@ -435,7 +451,8 @@ struct TypedRangeStrideSegment { * * \return true if begin, end, or size does not match, else false */ - RAJA_HOST_DEVICE RAJA_INLINE bool operator!=(TypedRangeStrideSegment const& o) const + RAJA_HOST_DEVICE RAJA_INLINE bool + operator!=(TypedRangeStrideSegment const& o) const { return !(operator==(o)); } @@ -450,7 +467,7 @@ struct TypedRangeStrideSegment { * \param length maximum length of new range * * \return TypedRangeStrideSegment representing the interval - * [ *begin() + begin * stride, + * [ *begin() + begin * stride, * min( *begin() + (begin + length) * stride, *end() ) * * Here's an example of a slice operation on a range segment with a negative @@ -466,7 +483,7 @@ struct TypedRangeStrideSegment { * // 5 indices in r starting at the 6th entry * auto s = r.slice(6, 6); * - * \endverbatim + * \endverbatim */ RAJA_HOST_DEVICE TypedRangeStrideSegment slice(StorageT begin, DiffT length) const @@ -475,15 +492,17 @@ struct TypedRangeStrideSegment { StorageT start = m_begin[0] + begin * stride; StorageT end = start + stride * length; - if (stride > 0) { + if (stride > 0) + { end = end > m_end[0] ? m_end[0] : end; - } else { + } + else + { end = end < m_end[0] ? m_end[0] : end; } - return TypedRangeStrideSegment{stripIndexType(start), - stripIndexType(end), - m_begin.get_stride()}; + return TypedRangeStrideSegment{ + stripIndexType(start), stripIndexType(end), m_begin.get_stride()}; } /*! 
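
A similarly small sketch of the strided-range behavior preserved by the reformatted constructor and slice() hunks above (values chosen purely for illustration):

    // Forward stride: iterates 0, 2, 4, 6, 8, so size() == 5
    RAJA::TypedRangeStrideSegment<int> fwd(0, 10, 2);

    // Negative stride runs backward: iterates 10, 8, 6, 4, 2, so size() == 5
    RAJA::TypedRangeStrideSegment<int> bwd(10, 0, -2);

    // slice(1, 3) starts at the 2nd entry and keeps at most 3 iterates: 2, 4, 6
    auto sub = fwd.slice(1, 3);

The same forward segment can be built with RAJA::make_strided_range(0, 10, 2), which, as the hunk above shows, static_asserts that the stride type is signed and matches the signed form of the end type.
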
@@ -518,18 +537,19 @@ namespace detail template struct common_type - : std::common_type::type> { -}; + : std::common_type::type> +{}; template -struct common_type { +struct common_type +{ using type = T; }; template using common_type_t = typename common_type::type; -} // namespace detail +} // namespace detail /*! * \brief Function to make a TypedRangeSegment for the interval [begin, end) @@ -549,7 +569,7 @@ RAJA_HOST_DEVICE TypedRangeSegment make_range(BeginT&& begin, } /*! - * \brief Function to make a TypedRangeStride Segment for the interval + * \brief Function to make a TypedRangeStride Segment for the interval * [begin, end) with given stride * * \return a newly constructed TypedRangeStrideSegment where @@ -561,13 +581,14 @@ template > -RAJA_HOST_DEVICE TypedRangeStrideSegment make_strided_range( - BeginT&& begin, - EndT&& end, - StrideT&& stride) +RAJA_HOST_DEVICE TypedRangeStrideSegment +make_strided_range(BeginT&& begin, EndT&& end, StrideT&& stride) { - static_assert(std::is_signed::value, "make_strided_segment : stride must be signed."); - static_assert(std::is_same, StrideT>::value, "make_stride_segment : stride and end must be of similar types."); + static_assert(std::is_signed::value, + "make_strided_segment : stride must be signed."); + static_assert(std::is_same, StrideT>::value, + "make_stride_segment : stride and end must be of similar " + "types."); return {begin, end, stride}; } @@ -576,15 +597,15 @@ namespace concepts template struct RangeConstructible - : DefineConcept(camp::val>()) { -}; + : DefineConcept(camp::val>()) +{}; template struct RangeStrideConstructible - : DefineConcept(camp::val>()) { -}; + : DefineConcept(camp::val>()) +{}; -} // namespace concepts +} // namespace concepts namespace type_traits { @@ -595,9 +616,9 @@ DefineTypeTraitFromConcept(is_range_constructible, DefineTypeTraitFromConcept(is_range_stride_constructible, RAJA::concepts::RangeStrideConstructible); -} // namespace type_traits +} // namespace type_traits -} // namespace RAJA +} // namespace RAJA namespace std { @@ -618,6 +639,6 @@ RAJA_HOST_DEVICE RAJA_INLINE void swap(RAJA::TypedRangeStrideSegment& a, a.swap(b); } -} // namespace std +} // namespace std -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/internal/DepGraphNode.hpp b/include/RAJA/internal/DepGraphNode.hpp index 8feceae22f..6c9858221a 100644 --- a/include/RAJA/internal/DepGraphNode.hpp +++ b/include/RAJA/internal/DepGraphNode.hpp @@ -57,8 +57,7 @@ class RAJA_ALIGNED_ATTR(256) DepGraphNode /// DepGraphNode() : m_num_dep_tasks(0), m_semaphore_reload_value(0), m_semaphore_value(0) - { - } + {} /// /// Get/set semaphore value; i.e., the current number of (unsatisfied) @@ -82,7 +81,8 @@ class RAJA_ALIGNED_ATTR(256) DepGraphNode /// void satisfyOne() { - if (m_semaphore_value > 0) { + if (m_semaphore_value > 0) + { --m_semaphore_value; } } @@ -92,7 +92,8 @@ class RAJA_ALIGNED_ATTR(256) DepGraphNode /// void wait() { - while (m_semaphore_value > 0) { + while (m_semaphore_value > 0) + { // TODO: an efficient wait would be better here, but the standard // promise/future is not good enough std::this_thread::yield(); @@ -124,6 +125,6 @@ class RAJA_ALIGNED_ATTR(256) DepGraphNode std::atomic m_semaphore_value; }; -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/internal/Iterators.hpp b/include/RAJA/internal/Iterators.hpp index 
6f32a56e6d..73628a035b 100644 --- a/include/RAJA/internal/Iterators.hpp +++ b/include/RAJA/internal/Iterators.hpp @@ -50,7 +50,8 @@ std::string overflow_msg(LType lhs, RType rhs) template RAJA_HOST_DEVICE bool is_addition_overflow(Type lhs, DifferenceType rhs) { - if (std::is_unsigned::value) { + if (std::is_unsigned::value) + { if ((rhs > 0) && (lhs > std::numeric_limits::max() - rhs)) return true; if ((rhs < 0) && (lhs < std::numeric_limits::min() - rhs)) @@ -64,18 +65,22 @@ RAJA_HOST_DEVICE bool is_subtraction_overflow(Type lhs, DifferenceType rhs, bool iterator_on_left = true) { - if (iterator_on_left) { + if (iterator_on_left) + { - if (std::is_unsigned::value) { + if (std::is_unsigned::value) + { if ((rhs > 0) && (lhs < std::numeric_limits::min() + rhs)) return true; if ((rhs < 0) && (lhs > std::numeric_limits::max() + rhs)) return true; } + } + else + { // Special case where operation is : value(lhs) - iterator(rhs). - } else { // Special case where operation is : value(lhs) - iterator(rhs). - - if (std::is_unsigned::value) { + if (std::is_unsigned::value) + { if ((lhs > 0) && (rhs < std::numeric_limits::min() + lhs)) return true; if ((lhs < 0)) return true; @@ -121,8 +126,7 @@ class numeric_iterator RAJA_HOST_DEVICE constexpr numeric_iterator(const stripped_value_type& rhs) : val(rhs) - { - } + {} RAJA_HOST_DEVICE inline DifferenceType get_stride() const { return 1; } @@ -174,8 +178,8 @@ class numeric_iterator return tmp; } - RAJA_HOST_DEVICE inline numeric_iterator& operator+=( - const difference_type& rhs) + RAJA_HOST_DEVICE inline numeric_iterator& + operator+=(const difference_type& rhs) { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) check_is_addition_overflow(val, rhs); @@ -183,8 +187,8 @@ class numeric_iterator val += rhs; return *this; } - RAJA_HOST_DEVICE inline numeric_iterator& operator-=( - const difference_type& rhs) + RAJA_HOST_DEVICE inline numeric_iterator& + operator-=(const difference_type& rhs) { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) check_is_subtraction_overflow(val, rhs); @@ -192,48 +196,47 @@ class numeric_iterator val -= rhs; return *this; } - RAJA_HOST_DEVICE inline numeric_iterator& operator+=( - const numeric_iterator& rhs) + RAJA_HOST_DEVICE inline numeric_iterator& + operator+=(const numeric_iterator& rhs) { val += rhs.val; return *this; } - RAJA_HOST_DEVICE inline numeric_iterator& operator-=( - const numeric_iterator& rhs) + RAJA_HOST_DEVICE inline numeric_iterator& + operator-=(const numeric_iterator& rhs) { val -= rhs.val; return *this; } - RAJA_HOST_DEVICE inline stripped_value_type operator+( - const numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline stripped_value_type + operator+(const numeric_iterator& rhs) const { return val + rhs.val; } - RAJA_HOST_DEVICE inline stripped_value_type operator-( - const numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline stripped_value_type + operator-(const numeric_iterator& rhs) const { return val - rhs.val; } - RAJA_HOST_DEVICE inline numeric_iterator operator+( - const difference_type& rhs) const + RAJA_HOST_DEVICE inline numeric_iterator + operator+(const difference_type& rhs) const { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) check_is_addition_overflow(val, rhs); #endif return numeric_iterator(val + rhs); } - RAJA_HOST_DEVICE inline numeric_iterator operator-( - const difference_type& rhs) const + RAJA_HOST_DEVICE inline numeric_iterator + operator-(const difference_type& rhs) const { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) check_is_subtraction_overflow(val, rhs); #endif 
return numeric_iterator(val - rhs); } - RAJA_HOST_DEVICE friend constexpr numeric_iterator operator+( - difference_type lhs, - const numeric_iterator& rhs) + RAJA_HOST_DEVICE friend constexpr numeric_iterator + operator+(difference_type lhs, const numeric_iterator& rhs) { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) return is_addition_overflow(rhs.val, lhs) @@ -243,9 +246,8 @@ class numeric_iterator return numeric_iterator(lhs + rhs.val); #endif } - RAJA_HOST_DEVICE friend constexpr numeric_iterator operator-( - difference_type lhs, - const numeric_iterator& rhs) + RAJA_HOST_DEVICE friend constexpr numeric_iterator + operator-(difference_type lhs, const numeric_iterator& rhs) { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) return is_subtraction_overflow(rhs.val, lhs, false) @@ -287,17 +289,20 @@ class strided_numeric_iterator using iterator_category = std::random_access_iterator_tag; constexpr strided_numeric_iterator() noexcept = default; - constexpr strided_numeric_iterator(const strided_numeric_iterator&) noexcept = default; - constexpr strided_numeric_iterator(strided_numeric_iterator&&) noexcept = default; - strided_numeric_iterator& operator=(const strided_numeric_iterator&) noexcept = default; - strided_numeric_iterator& operator=(strided_numeric_iterator&&) noexcept = default; + constexpr strided_numeric_iterator(const strided_numeric_iterator&) noexcept = + default; + constexpr strided_numeric_iterator(strided_numeric_iterator&&) noexcept = + default; + strided_numeric_iterator& + operator=(const strided_numeric_iterator&) noexcept = default; + strided_numeric_iterator& + operator=(strided_numeric_iterator&&) noexcept = default; RAJA_HOST_DEVICE constexpr strided_numeric_iterator( stripped_value_type rhs, DifferenceType stride_ = DifferenceType(1)) : val(rhs), stride(stride_) - { - } + {} RAJA_HOST_DEVICE inline DifferenceType get_stride() const { return stride; } @@ -312,8 +317,8 @@ class strided_numeric_iterator return *this; } - RAJA_HOST_DEVICE inline strided_numeric_iterator& operator+=( - const difference_type& rhs) + RAJA_HOST_DEVICE inline strided_numeric_iterator& + operator+=(const difference_type& rhs) { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) check_is_addition_overflow(val, rhs * stride); @@ -321,8 +326,8 @@ class strided_numeric_iterator val += rhs * stride; return *this; } - RAJA_HOST_DEVICE inline strided_numeric_iterator& operator-=( - const difference_type& rhs) + RAJA_HOST_DEVICE inline strided_numeric_iterator& + operator-=(const difference_type& rhs) { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) check_is_subtraction_overflow(val, rhs * stride); @@ -331,15 +336,15 @@ class strided_numeric_iterator return *this; } - RAJA_HOST_DEVICE inline difference_type operator+( - const strided_numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline difference_type + operator+(const strided_numeric_iterator& rhs) const { return (static_cast(val) + (static_cast(rhs.val))) / stride; } - RAJA_HOST_DEVICE inline difference_type operator-( - const strided_numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline difference_type + operator-(const strided_numeric_iterator& rhs) const { difference_type diff = (static_cast(val) - (static_cast(rhs.val))); @@ -348,16 +353,16 @@ class strided_numeric_iterator ? 
(difference_type{1} + diff / stride) : diff / stride; } - RAJA_HOST_DEVICE inline strided_numeric_iterator operator+( - const difference_type& rhs) const + RAJA_HOST_DEVICE inline strided_numeric_iterator + operator+(const difference_type& rhs) const { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) check_is_addition_overflow(val, rhs * stride); #endif return strided_numeric_iterator(val + rhs * stride, stride); } - RAJA_HOST_DEVICE inline strided_numeric_iterator operator-( - const difference_type& rhs) const + RAJA_HOST_DEVICE inline strided_numeric_iterator + operator-(const difference_type& rhs) const { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) check_is_subtraction_overflow(val, rhs * stride); @@ -367,34 +372,34 @@ class strided_numeric_iterator // Specialized comparison to allow normal iteration to work on off-stride // multiples by adjusting rhs to the nearest *higher* multiple of stride - RAJA_HOST_DEVICE inline bool operator!=( - const strided_numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline bool + operator!=(const strided_numeric_iterator& rhs) const { return (val - rhs.val) / stride; } - RAJA_HOST_DEVICE inline bool operator==( - const strided_numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline bool + operator==(const strided_numeric_iterator& rhs) const { return !((val - rhs.val) / stride); } - RAJA_HOST_DEVICE inline bool operator>( - const strided_numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline bool + operator>(const strided_numeric_iterator& rhs) const { return val * stride > rhs.val * stride; } - RAJA_HOST_DEVICE inline bool operator<( - const strided_numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline bool + operator<(const strided_numeric_iterator& rhs) const { return val * stride < rhs.val * stride; } - RAJA_HOST_DEVICE inline bool operator>=( - const strided_numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline bool + operator>=(const strided_numeric_iterator& rhs) const { return val * stride >= rhs.val * stride; } - RAJA_HOST_DEVICE inline bool operator<=( - const strided_numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline bool + operator<=(const strided_numeric_iterator& rhs) const { return val * stride <= rhs.val * stride; } @@ -419,8 +424,8 @@ class strided_numeric_iterator }; -} // namespace Iterators +} // namespace Iterators -} // namespace RAJA +} // namespace RAJA #endif /* RAJA_ITERATORS_HPP */ diff --git a/include/RAJA/internal/MemUtils_CPU.hpp b/include/RAJA/internal/MemUtils_CPU.hpp index 55015f9ab7..e1540c8384 100644 --- a/include/RAJA/internal/MemUtils_CPU.hpp +++ b/include/RAJA/internal/MemUtils_CPU.hpp @@ -27,7 +27,7 @@ #include "RAJA/util/types.hpp" -#if defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || \ +#if defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || \ defined(__MINGW32__) || defined(__BORLANDC__) #define RAJA_PLATFORM_WINDOWS #include @@ -53,10 +53,10 @@ inline void* allocate_aligned(size_t alignment, size_t size) #elif defined(RAJA_PLATFORM_WINDOWS) return _aligned_malloc(size, alignment); #else - char *mem = (char *)malloc(size + alignment + sizeof(void *)); + char* mem = (char*)malloc(size + alignment + sizeof(void*)); if (nullptr == mem) return nullptr; - void **ptr = (void **)((std::uintptr_t)(mem + alignment + sizeof(void *)) & - ~(alignment - 1)); + void** ptr = (void**)((std::uintptr_t)(mem + alignment + sizeof(void*)) & + ~(alignment - 1)); // Store the original address one position behind what we give the user. 
ptr[-1] = mem; return ptr; @@ -97,30 +97,28 @@ inline void free_aligned(void* ptr) /// struct FreeAligned { - void operator()(void* ptr) - { - free_aligned(ptr); - } + void operator()(void* ptr) { free_aligned(ptr); } }; /// /// Deleter function object for memory allocated with allocate_aligned_type /// that calls the destructor for the fist size objects in the storage. /// -template < typename T, typename index_type > +template struct FreeAlignedType : FreeAligned { index_type size = 0; void operator()(T* ptr) { - for ( index_type i = size; i > 0; --i ) { - ptr[i-1].~T(); + for (index_type i = size; i > 0; --i) + { + ptr[i - 1].~T(); } FreeAligned::operator()(ptr); } }; -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/internal/RAJAVec.hpp b/include/RAJA/internal/RAJAVec.hpp index 1d0ec0cbeb..966a319bbc 100644 --- a/include/RAJA/internal/RAJAVec.hpp +++ b/include/RAJA/internal/RAJAVec.hpp @@ -49,7 +49,7 @@ namespace RAJA * ****************************************************************************** */ -template > +template > class RAJAVec { using allocator_traits_type = std::allocator_traits; @@ -57,8 +57,9 @@ class RAJAVec typename allocator_traits_type::propagate_on_container_copy_assignment; using propagate_on_container_move_assignment = typename allocator_traits_type::propagate_on_container_move_assignment; - using propagate_on_container_swap = + using propagate_on_container_swap = typename allocator_traits_type::propagate_on_container_swap; + public: using value_type = T; using allocator_type = Allocator; @@ -86,7 +87,9 @@ class RAJAVec /// RAJAVec(const RAJAVec& other) : m_data(nullptr), - m_allocator(allocator_traits_type::select_on_container_copy_construction(other.m_allocator)), + m_allocator( + allocator_traits_type::select_on_container_copy_construction( + other.m_allocator)), m_capacity(0), m_size(0) { @@ -113,7 +116,8 @@ class RAJAVec /// RAJAVec& operator=(const RAJAVec& rhs) { - if (&rhs != this) { + if (&rhs != this) + { copy_assign_private(rhs, propagate_on_container_copy_assignment{}); } return *this; @@ -124,8 +128,10 @@ class RAJAVec /// RAJAVec& operator=(RAJAVec&& rhs) { - if (&rhs != this) { - move_assign_private(std::move(rhs), propagate_on_container_move_assignment{}); + if (&rhs != this) + { + move_assign_private(std::move(rhs), + propagate_on_container_move_assignment{}); } return *this; } @@ -150,25 +156,25 @@ class RAJAVec /// /// Get a pointer to the beginning of the contiguous vector /// - pointer data() { return m_data; } + pointer data() { return m_data; } /// const_pointer data() const { return m_data; } /// /// Get an iterator to the end. /// - iterator end() { return m_data + m_size; } + iterator end() { return m_data + m_size; } /// - const_iterator end() const { return m_data + m_size; } + const_iterator end() const { return m_data + m_size; } /// const_iterator cend() const { return m_data + m_size; } /// /// Get an iterator to the beginning. /// - iterator begin() { return m_data; } + iterator begin() { return m_data; } /// - const_iterator begin() const { return m_data; } + const_iterator begin() const { return m_data; } /// const_iterator cbegin() const { return m_data; } @@ -200,18 +206,12 @@ class RAJAVec /// /// Shrink the capacity of the vector to the current size. /// - void shrink_to_fit() - { - shrink_cap(m_size); - } + void shrink_to_fit() { shrink_cap(m_size); } /// /// Empty vector of all data. 
/// - void clear() - { - destroy_items_after(0); - } + void clear() { destroy_items_after(0); } /// /// Change the size of the vector, @@ -221,10 +221,13 @@ class RAJAVec RAJA_INLINE void resize(size_type new_size) { - if (new_size >= size()) { + if (new_size >= size()) + { reserve(new_size); construct_items_back(new_size); - } else { + } + else + { destroy_items_after(new_size); } } @@ -237,10 +240,13 @@ class RAJAVec RAJA_INLINE void resize(size_type new_size, const_reference new_value) { - if (new_size >= size()) { + if (new_size >= size()) + { reserve(new_size); construct_items_back(new_size, new_value); - } else { + } + else + { destroy_items_after(new_size); } } @@ -248,23 +254,23 @@ class RAJAVec /// /// Bracket operator accessor. /// - reference operator[](difference_type i) { return m_data[i]; } + reference operator[](difference_type i) { return m_data[i]; } /// const_reference operator[](difference_type i) const { return m_data[i]; } /// /// Access the last item of the vector. /// - reference front() { return m_data[0]; } + reference front() { return m_data[0]; } /// const_reference front() const { return m_data[0]; } /// /// Access the last item of the vector. /// - reference back() { return m_data[m_size-1]; } + reference back() { return m_data[m_size - 1]; } /// - const_reference back() const { return m_data[m_size-1]; } + const_reference back() const { return m_data[m_size - 1]; } /// /// Add item to front end of vector. Note that this operation is unique to @@ -272,28 +278,31 @@ class RAJAVec /// void push_front(const_reference item) { emplace_front_private(item); } /// - void push_front( value_type&& item) { emplace_front_private(std::move(item)); } + void push_front(value_type&& item) { emplace_front_private(std::move(item)); } /// - template < typename ... Os > - void emplace_front(Os&&... os) { emplace_front_private(std::forward(os)...); } + template + void emplace_front(Os&&... os) + { + emplace_front_private(std::forward(os)...); + } /// /// Add item to back end of vector. /// void push_back(const_reference item) { emplace_back_private(item); } /// - void push_back( value_type&& item) { emplace_back_private(std::move(item)); } + void push_back(value_type&& item) { emplace_back_private(std::move(item)); } /// - template < typename ... Os > - void emplace_back(Os&&... os) { emplace_back_private(std::forward(os)...); } + template + void emplace_back(Os&&... os) + { + emplace_back_private(std::forward(os)...); + } /// /// Remove the last item of the vector. 
/// - void pop_back() - { - destroy_items_after(m_size-1); - } + void pop_back() { destroy_items_after(m_size - 1); } private: pointer m_data; @@ -307,7 +316,8 @@ class RAJAVec /// void copy_assign_private(RAJAVec const& rhs, std::true_type) { - if (m_allocator != rhs.m_allocator) { + if (m_allocator != rhs.m_allocator) + { clear(); shrink_to_fit(); m_allocator = rhs.m_allocator; @@ -323,10 +333,13 @@ class RAJAVec void copy_assign_private(RAJAVec const& rhs, std::false_type) { reserve(rhs.size()); - if (size() < rhs.size()) { + if (size() < rhs.size()) + { copy_assign_items(0, size(), rhs.data()); copy_construct_items_back(rhs.size(), rhs.data()); - } else { + } + else + { copy_assign_items(0, rhs.size(), rhs.data()); destroy_items_after(size()); } @@ -357,7 +370,8 @@ class RAJAVec /// void move_assign_private(RAJAVec&& rhs, std::false_type) { - if (m_allocator == rhs.m_allocator) { + if (m_allocator == rhs.m_allocator) + { clear(); shrink_to_fit(); @@ -368,12 +382,17 @@ class RAJAVec rhs.m_data = nullptr; rhs.m_capacity = 0; rhs.m_size = 0; - } else { + } + else + { reserve(rhs.size()); - if (size() < rhs.size()) { + if (size() < rhs.size()) + { move_assign_items(0, size(), rhs.data()); move_construct_items_back(rhs.size(), rhs.data()); - } else { + } + else + { move_assign_items(0, rhs.size(), rhs.data()); destroy_items_after(size()); } @@ -386,10 +405,10 @@ class RAJAVec void swap_private(RAJAVec& other, std::true_type) { using std::swap; - swap(m_data, other.m_data); + swap(m_data, other.m_data); swap(m_allocator, other.m_allocator); - swap(m_capacity, other.m_capacity); - swap(m_size, other.m_size); + swap(m_capacity, other.m_capacity); + swap(m_size, other.m_size); } /// @@ -398,9 +417,9 @@ class RAJAVec void swap_private(RAJAVec& other, std::false_type) { using std::swap; - swap(m_data, other.m_data); - swap(m_capacity, other.m_capacity); - swap(m_size, other.m_size); + swap(m_data, other.m_data); + swap(m_capacity, other.m_capacity); + swap(m_size, other.m_size); } // @@ -408,7 +427,8 @@ class RAJAVec // void copy_assign_items(size_type first, size_type last, const_pointer o_data) { - for (size_type i = first; i < last; ++i) { + for (size_type i = first; i < last; ++i) + { m_data[i] = o_data[i]; } } @@ -418,7 +438,8 @@ class RAJAVec // void move_assign_items(size_type first, size_type last, pointer o_data) { - for (size_type i = first; i < last; ++i) { + for (size_type i = first; i < last; ++i) + { m_data[i] = std::move(o_data[i]); } } @@ -426,11 +447,13 @@ class RAJAVec // // Construct items [m_size, new_size) from args. // - template < typename ... Os > + template void construct_items_back(size_type new_size, Os&&... 
os) { - for (; m_size < new_size; ++m_size) { - allocator_traits_type::construct(m_allocator, m_data+m_size, std::forward(os)...); + for (; m_size < new_size; ++m_size) + { + allocator_traits_type::construct( + m_allocator, m_data + m_size, std::forward(os)...); } } @@ -439,8 +462,10 @@ class RAJAVec // void copy_construct_items_back(size_type new_size, const_pointer o_data) { - for (; m_size < new_size; ++m_size) { - allocator_traits_type::construct(m_allocator, m_data+m_size, o_data[m_size]); + for (; m_size < new_size; ++m_size) + { + allocator_traits_type::construct( + m_allocator, m_data + m_size, o_data[m_size]); } } @@ -449,8 +474,10 @@ class RAJAVec // void move_construct_items_back(size_type new_size, pointer o_data) { - for (; m_size < new_size; ++m_size) { - allocator_traits_type::construct(m_allocator, m_data+m_size, std::move(o_data[m_size])); + for (; m_size < new_size; ++m_size) + { + allocator_traits_type::construct( + m_allocator, m_data + m_size, std::move(o_data[m_size])); } } @@ -459,39 +486,45 @@ class RAJAVec // void destroy_items_after(size_type new_end) { - for (; m_size > new_end; --m_size) { - allocator_traits_type::destroy(m_allocator, m_data+m_size-1); + for (; m_size > new_end; --m_size) + { + allocator_traits_type::destroy(m_allocator, m_data + m_size - 1); } } // // Add an item to the front, shifting all existing items back one. // - template < typename ... Os > + template void emplace_front_private(Os&&... os) { reserve(m_size + 1); - if (m_size > 0) { + if (m_size > 0) + { size_type i = m_size; - allocator_traits_type::construct(m_allocator, m_data+i, std::move(m_data[i - 1])); - for (--i; i > 0; --i) { + allocator_traits_type::construct( + m_allocator, m_data + i, std::move(m_data[i - 1])); + for (--i; i > 0; --i) + { m_data[i] = std::move(m_data[i - 1]); } allocator_traits_type::destroy(m_allocator, m_data); } - allocator_traits_type::construct(m_allocator, m_data, std::forward(os)...); + allocator_traits_type::construct( + m_allocator, m_data, std::forward(os)...); m_size++; } // // Add an item to the back. // - template < typename ... Os > + template void emplace_back_private(Os&&... 
os) { reserve(m_size + 1); - allocator_traits_type::construct(m_allocator, m_data+m_size, std::forward(os)...); + allocator_traits_type::construct( + m_allocator, m_data + m_size, std::forward(os)...); m_size++; } @@ -509,7 +542,8 @@ class RAJAVec size_type get_next_cap(size_type target_size) { size_type next_cap = s_init_cap; - if (m_capacity != 0) { + if (m_capacity != 0) + { next_cap = static_cast(m_capacity * s_grow_fac); } return std::max(target_size, next_cap); @@ -520,7 +554,8 @@ class RAJAVec // void grow_cap(size_type target_size) { - if (m_capacity < target_size) { + if (m_capacity < target_size) + { change_cap(get_next_cap(target_size)); } } @@ -530,7 +565,8 @@ class RAJAVec // void shrink_cap(size_type target_size) { - if (m_capacity > target_size) { + if (m_capacity > target_size) + { change_cap(std::max(m_size, target_size)); } } @@ -542,14 +578,18 @@ class RAJAVec void change_cap(size_type next_cap) { pointer tdata = nullptr; - if (next_cap != 0) { + if (next_cap != 0) + { tdata = allocator_traits_type::allocate(m_allocator, next_cap); } - if (m_data) { - for (size_type i = 0; i < m_size; ++i) { - allocator_traits_type::construct(m_allocator, tdata+i, std::move(m_data[i])); - allocator_traits_type::destroy(m_allocator, m_data+i); + if (m_data) + { + for (size_type i = 0; i < m_size; ++i) + { + allocator_traits_type::construct( + m_allocator, tdata + i, std::move(m_data[i])); + allocator_traits_type::destroy(m_allocator, m_data + i); } allocator_traits_type::deallocate(m_allocator, m_data, m_capacity); } @@ -559,6 +599,6 @@ class RAJAVec } }; -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/internal/ThreadUtils_CPU.hpp b/include/RAJA/internal/ThreadUtils_CPU.hpp index addd22c4f7..c83905ea77 100644 --- a/include/RAJA/internal/ThreadUtils_CPU.hpp +++ b/include/RAJA/internal/ThreadUtils_CPU.hpp @@ -47,6 +47,6 @@ int getMaxOMPThreadsCPU() return nthreads; } -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/internal/fault_tolerance.hpp b/include/RAJA/internal/fault_tolerance.hpp index cf3a86cede..da72005702 100644 --- a/include/RAJA/internal/fault_tolerance.hpp +++ b/include/RAJA/internal/fault_tolerance.hpp @@ -37,62 +37,74 @@ #include #include "cycle.h" -#define RAJA_FT_BEGIN \ - extern volatile int fault_type; \ - bool repeat; \ - bool do_time = false; \ - ticks start = 0, stop = 0; \ - if (fault_type != 0) { \ - printf("Uncaught fault %d\n", fault_type); \ - fault_type = 0; \ - } \ - do { \ - repeat = false; \ - if (do_time) { \ - start = getticks(); \ +#define RAJA_FT_BEGIN \ + extern volatile int fault_type; \ + bool repeat; \ + bool do_time = false; \ + ticks start = 0, stop = 0; \ + if (fault_type != 0) \ + { \ + printf("Uncaught fault %d\n", fault_type); \ + fault_type = 0; \ + } \ + do \ + { \ + repeat = false; \ + if (do_time) \ + { \ + start = getticks(); \ } -#define RAJA_FT_END \ - if (do_time) { \ - stop = getticks(); \ - printf("recoverable fault clock cycles = %16f\n", elapsed(stop, start)); \ - do_time = false; \ - fault_type = 0; \ - } \ - if (fault_type < 0) { \ - printf("Unrecoverable fault (restart penalty)\n"); \ - fault_type = 0; \ - } \ - if (fault_type > 0) { \ - /* invalidate cache */ \ - repeat = true; \ - do_time = true; \ - } \ - } \ - while (repeat == true) \ +#define RAJA_FT_END \ + if (do_time) \ 
+ { \ + stop = getticks(); \ + printf("recoverable fault clock cycles = %16f\n", elapsed(stop, start)); \ + do_time = false; \ + fault_type = 0; \ + } \ + if (fault_type < 0) \ + { \ + printf("Unrecoverable fault (restart penalty)\n"); \ + fault_type = 0; \ + } \ + if (fault_type > 0) \ + { \ + /* invalidate cache */ \ + repeat = true; \ + do_time = true; \ + } \ + } \ + while (repeat == true) \ ; #else -#define RAJA_FT_BEGIN \ - extern volatile int fault_type; \ - bool repeat; \ - if (fault_type == 0) { \ - do { \ +#define RAJA_FT_BEGIN \ + extern volatile int fault_type; \ + bool repeat; \ + if (fault_type == 0) \ + { \ + do \ + { \ repeat = false; -#define RAJA_FT_END \ - if (fault_type > 0) { \ - /* invalidate cache */ \ - repeat = true; \ - fault_type = 0; \ - } \ - } \ - while (repeat == true) \ - ; \ - } \ - else { fault_type = 0; /* ignore for the simulation */ } - -#endif // RAJA_REPORT_FT +#define RAJA_FT_END \ + if (fault_type > 0) \ + { \ + /* invalidate cache */ \ + repeat = true; \ + fault_type = 0; \ + } \ + } \ + while (repeat == true) \ + ; \ + } \ + else \ + { \ + fault_type = 0; /* ignore for the simulation */ \ + } + +#endif // RAJA_REPORT_FT #else @@ -100,6 +112,6 @@ #define RAJA_FT_END -#endif // RAJA_ENABLE_FT +#endif // RAJA_ENABLE_FT -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/internal/foldl.hpp b/include/RAJA/internal/foldl.hpp index af65c05392..3e9e0bc15a 100644 --- a/include/RAJA/internal/foldl.hpp +++ b/include/RAJA/internal/foldl.hpp @@ -44,14 +44,16 @@ template struct foldl_impl; template -struct foldl_impl { +struct foldl_impl +{ using Ret = Arg1; }; #if RAJA_HAS_CXX17_IS_INVOCABLE template -struct foldl_impl { +struct foldl_impl +{ using Ret = typename std::invoke_result::type; }; @@ -60,18 +62,22 @@ template -struct foldl_impl { - using Ret = typename foldl_impl< - Op, - typename std::invoke_result::type, - Arg3>::type, - Rest...>::Ret; +struct foldl_impl +{ + using Ret = + typename foldl_impl::type, + Arg3>::type, + Rest...>::Ret; }; #else template -struct foldl_impl { +struct foldl_impl +{ using Ret = typename std::result_of::type; }; @@ -80,7 +86,8 @@ template -struct foldl_impl { +struct foldl_impl +{ using Ret = typename foldl_impl< Op, typename std::result_of::type, @@ -93,17 +100,16 @@ struct foldl_impl { } // namespace detail template -RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl( - Op&& RAJA_UNUSED_ARG(operation), - Arg1&& arg) -> typename detail::foldl_impl::Ret +RAJA_HOST_DEVICE RAJA_INLINE constexpr auto +foldl(Op&& RAJA_UNUSED_ARG(operation), Arg1&& arg) -> + typename detail::foldl_impl::Ret { return camp::forward(arg); } template -RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation, - Arg1&& arg1, - Arg2&& arg2) -> +RAJA_HOST_DEVICE RAJA_INLINE constexpr auto +foldl(Op&& operation, Arg1&& arg1, Arg2&& arg2) -> typename detail::foldl_impl::Ret { return camp::forward(operation)(camp::forward(arg1), @@ -115,11 +121,8 @@ template -RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation, - Arg1&& arg1, - Arg2&& arg2, - Arg3&& arg3, - Rest&&... rest) -> +RAJA_HOST_DEVICE RAJA_INLINE constexpr auto +foldl(Op&& operation, Arg1&& arg1, Arg2&& arg2, Arg3&& arg3, Rest&&... rest) -> typename detail::foldl_impl::Ret { return foldl(camp::forward(operation), @@ -157,6 +160,6 @@ RAJA_HOST_DEVICE RAJA_INLINE constexpr Result min(Args... 
args) } -} // namespace RAJA +} // namespace RAJA #endif diff --git a/include/RAJA/internal/get_platform.hpp b/include/RAJA/internal/get_platform.hpp index 0354d04bfd..d32344629c 100644 --- a/include/RAJA/internal/get_platform.hpp +++ b/include/RAJA/internal/get_platform.hpp @@ -8,18 +8,21 @@ namespace RAJA { -namespace policy { -namespace multi { +namespace policy +{ +namespace multi +{ template class MultiPolicy; } -} +} // namespace policy -namespace detail +namespace detail { -struct max_platform { +struct max_platform +{ RAJA_HOST_DEVICE RAJA_INLINE constexpr RAJA::Platform operator()(const RAJA::Platform& l, @@ -34,7 +37,8 @@ struct max_platform { * This is a catch-all, so anything undefined gets Platform::undefined */ template -struct get_platform { +struct get_platform +{ // catch-all: undefined platform static constexpr Platform value = Platform::undefined; }; @@ -45,7 +49,8 @@ struct get_platform { * reduction of them all. */ template -struct get_platform_from_list { +struct get_platform_from_list +{ static constexpr Platform value = foldl(max_platform(), get_platform::value...); }; @@ -54,7 +59,8 @@ struct get_platform_from_list { * Define an empty list as Platform::undefined; */ template <> -struct get_platform_from_list<> { +struct get_platform_from_list<> +{ static constexpr Platform value = Platform::undefined; }; @@ -67,10 +73,10 @@ struct get_platform_from_list<> { */ template struct get_platform::value - && !RAJA::type_traits::is_indexset_policy:: - value>::type> { + typename std::enable_if< + std::is_base_of::value && + !RAJA::type_traits::is_indexset_policy::value>::type> +{ static constexpr Platform value = T::platform; }; @@ -83,12 +89,13 @@ struct get_platform struct get_platform> - : public get_platform_from_list { -}; + : public get_platform_from_list +{}; template -struct get_statement_platform { +struct get_statement_platform +{ static constexpr Platform value = get_platform_from_list::value; @@ -102,7 +109,8 @@ struct get_statement_platform { * each of them. */ template -struct get_platform> { +struct get_platform> +{ static constexpr Platform value = foldl(max_platform(), get_statement_platform::value...); }; @@ -111,7 +119,8 @@ struct get_platform> { * Specialize for an empty statement list to be undefined */ template <> -struct get_platform> { +struct get_platform> +{ static constexpr Platform value = Platform::undefined; }; @@ -120,11 +129,12 @@ struct get_platform> { // Once a specific policy is selected, that policy will select the correct // platform... 
see policy_invoker in MultiPolicy.hpp template -struct get_platform> { +struct get_platform> +{ static constexpr Platform value = Platform::undefined; }; -} // closing brace for detail namespace -} // closing brace for RAJA namespace +} // namespace detail +} // namespace RAJA #endif // RAJA_get_platform_HPP diff --git a/include/RAJA/pattern/WorkGroup.hpp b/include/RAJA/pattern/WorkGroup.hpp index 767821b8d8..b3e50fea8e 100644 --- a/include/RAJA/pattern/WorkGroup.hpp +++ b/include/RAJA/pattern/WorkGroup.hpp @@ -38,38 +38,44 @@ namespace RAJA * * \verbatim - WorkPool, Allocator> pool(allocator); + WorkPool, Allocator> + pool(allocator); pool.enqueue(..., [=] (Index_type i, int* xarg0, int xarg1) { xarg0[i] = xarg1; }); - WorkGroup, Allocator> group = pool.instantiate(); + WorkGroup, Allocator> group = + pool.instantiate(); int* xarg0 = ...; int xarg1 = ...; - WorkSite, Allocator> site = group.run(xarg0, xarg1); + WorkSite, Allocator> site = + group.run(xarg0, xarg1); * \endverbatim * ****************************************************************************** */ -template < typename ... Args > +template using xargs = camp::list; -namespace detail { +namespace detail +{ -template < typename T > -struct is_xargs { +template +struct is_xargs +{ static constexpr bool value = false; }; -template < typename ... Args > -struct is_xargs> { +template +struct is_xargs> +{ static constexpr bool value = true; }; -} +} // namespace detail // @@ -102,7 +108,8 @@ struct is_xargs> { data[i] = 1; }); - WorkGroup, Allocator> group = pool.instantiate(); + WorkGroup, Allocator> group = + pool.instantiate(); * \endverbatim * @@ -112,11 +119,15 @@ template -struct WorkPool { - static_assert(RAJA::pattern_is::value, +struct WorkPool +{ + static_assert( + RAJA::pattern_is::value, "WorkPool: WORKGROUP_POLICY_T must be a workgroup policy"); static_assert(detail::is_xargs::value, - "WorkPool: EXTRA_ARGS_T must be a RAJA::xargs<...> type"); + "WorkPool: EXTRA_ARGS_T " + "must be a " + "RAJA::xargs<...> type"); }; /*! @@ -135,9 +146,11 @@ struct WorkPool { * * \verbatim - WorkGroup, Allocator> group = pool.instantiate(); + WorkGroup, Allocator> group = + pool.instantiate(); - WorkSite, Allocator> site = group.run(); + WorkSite, Allocator> site = + group.run(); * \endverbatim * @@ -147,11 +160,15 @@ template -struct WorkGroup { - static_assert(RAJA::pattern_is::value, +struct WorkGroup +{ + static_assert( + RAJA::pattern_is::value, "WorkGroup: WORKGROUP_POLICY_T must be a workgroup policy"); static_assert(detail::is_xargs::value, - "WorkGroup: EXTRA_ARGS_T must be a RAJA::xargs<...> type"); + "WorkGroup: " + "EXTRA_ARGS_T must be a " + "RAJA::xargs<...> type"); }; /*! 
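
The WorkPool, WorkGroup, and WorkSite templates above are meant to be used together, as their usage comments indicate. A sequential sketch in that spirit follows; the seq_work/ordered/ragged_array_of_objects policy choices and std::allocator<char> are assumptions made for illustration and are not taken from this patch:

    #include "RAJA/RAJA.hpp"
    #include <memory>
    #include <vector>

    using wg_policy = RAJA::WorkGroupPolicy<RAJA::seq_work,
                                            RAJA::ordered,
                                            RAJA::ragged_array_of_objects,
                                            RAJA::indirect_function_call_dispatch>;
    using Alloc = std::allocator<char>;

    void fill_ones(std::vector<double>& v)
    {
      double* data = v.data();
      const int n = static_cast<int>(v.size());

      // Collect loops in the pool, freeze them into a group, then run them.
      RAJA::WorkPool<wg_policy, int, RAJA::xargs<>, Alloc> pool(Alloc{});
      pool.enqueue(RAJA::TypedRangeSegment<int>(0, n),
                   [=](int i) { data[i] = 1.0; });

      RAJA::WorkGroup<wg_policy, int, RAJA::xargs<>, Alloc> group = pool.instantiate();
      RAJA::WorkSite<wg_policy, int, RAJA::xargs<>, Alloc> site = group.run();
      site.synchronize();
    }
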
@@ -170,7 +187,8 @@ struct WorkGroup { * * \verbatim - WorkSite, Allocator> site = group.run(); + WorkSite, Allocator> site = + group.run(); site.synchronize(); @@ -182,11 +200,15 @@ template -struct WorkSite { - static_assert(RAJA::pattern_is::value, +struct WorkSite +{ + static_assert( + RAJA::pattern_is::value, "WorkSite: WORKGROUP_POLICY_T must be a workgroup policy"); static_assert(detail::is_xargs::value, - "WorkSite: EXTRA_ARGS_T must be a RAJA::xargs<...> type"); + "WorkSite: EXTRA_ARGS_T " + "must be a " + "RAJA::xargs<...> type"); }; @@ -195,7 +217,7 @@ template struct WorkPool; + using policy = WorkGroupPolicy; using index_type = INDEX_T; using xarg_type = xargs; using Allocator = ALLOCATOR_T; @@ -218,10 +243,16 @@ struct WorkPool; private: - using workrunner_type = detail::WorkRunner< - exec_policy, order_policy, dispatch_policy, Allocator, index_type, Args...>; - using storage_type = detail::WorkStorage< - storage_policy, Allocator, typename workrunner_type::dispatcher_type>; + using workrunner_type = detail::WorkRunner; + using storage_type = + detail::WorkStorage; friend workgroup_type; friend worksite_type; @@ -229,9 +260,7 @@ struct WorkPool + template inline void enqueue(segment_T&& seg, loop_T&& loop_body) { { // ignore zero length loops - using std::begin; using std::end; + using std::begin; + using std::end; if (begin(seg) == end(seg)) return; } - if (m_storage.begin() == m_storage.end()) { + if (m_storage.begin() == m_storage.end()) + { // perform auto-reserve on reuse reserve(m_max_num_loops, m_max_storage_bytes); } @@ -273,8 +298,7 @@ struct WorkPool(seg), std::move(body)); + m_runner.enqueue(m_storage, std::forward(seg), std::move(body)); util::callPostCapturePlugins(context); } @@ -289,10 +313,7 @@ struct WorkPool struct WorkGroup; + using policy = WorkGroupPolicy; using index_type = INDEX_T; using xarg_type = xargs; using Allocator = ALLOCATOR_T; @@ -347,7 +371,8 @@ struct WorkGroup struct WorkSite; + using policy = WorkGroupPolicy; using index_type = INDEX_T; using xarg_type = xargs; using Allocator = ALLOCATOR_T; @@ -418,10 +442,7 @@ struct WorkSite -inline -typename WorkPool< - WorkGroupPolicy, - INDEX_T, - xargs, - ALLOCATOR_T>::workgroup_type -WorkPool< - WorkGroupPolicy, - INDEX_T, - xargs, - ALLOCATOR_T>::instantiate() +inline typename WorkPool, + INDEX_T, + xargs, + ALLOCATOR_T>::workgroup_type +WorkPool, + INDEX_T, + xargs, + ALLOCATOR_T>::instantiate() { // update max sizes to auto-reserve on reuse m_max_num_loops = std::max(m_storage.size(), m_max_num_loops); @@ -477,36 +497,43 @@ template -inline -typename WorkGroup< - WorkGroupPolicy, - INDEX_T, - xargs, - ALLOCATOR_T>::worksite_type +inline typename WorkGroup, + INDEX_T, + xargs, + ALLOCATOR_T>::worksite_type WorkGroup< - WorkGroupPolicy, + WorkGroupPolicy, INDEX_T, xargs, - ALLOCATOR_T>::run(typename WorkGroup< - WorkGroupPolicy, - INDEX_T, - xargs, - ALLOCATOR_T>::resource_type r, + ALLOCATOR_T>::run(typename WorkGroup, + INDEX_T, + xargs, + ALLOCATOR_T>::resource_type r, Args... 
args) { util::PluginContext context{util::make_context()}; util::callPreLaunchPlugins(context); // move any per run storage into worksite - worksite_type site(r, m_runner.run(m_storage, r, std::forward(args)...)); + worksite_type site(r, + m_runner.run(m_storage, r, std::forward(args)...)); util::callPostLaunchPlugins(context); return site; } -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/WorkGroup/Dispatcher.hpp b/include/RAJA/pattern/WorkGroup/Dispatcher.hpp index 1eac283f4b..954e59b9af 100644 --- a/include/RAJA/pattern/WorkGroup/Dispatcher.hpp +++ b/include/RAJA/pattern/WorkGroup/Dispatcher.hpp @@ -36,35 +36,36 @@ namespace RAJA namespace detail { -template < typename > +template struct DispatcherVoidPtrWrapper { void* ptr; DispatcherVoidPtrWrapper() = default; // implicit constructor from void* - RAJA_HOST_DEVICE DispatcherVoidPtrWrapper(void* p) : ptr(p) { } + RAJA_HOST_DEVICE DispatcherVoidPtrWrapper(void* p) : ptr(p) {} }; -template < typename > +template struct DispatcherVoidConstPtrWrapper { const void* ptr; DispatcherVoidConstPtrWrapper() = default; // implicit constructor from const void* - RAJA_HOST_DEVICE DispatcherVoidConstPtrWrapper(const void* p) : ptr(p) { } + RAJA_HOST_DEVICE DispatcherVoidConstPtrWrapper(const void* p) : ptr(p) {} }; -constexpr bool dispatcher_use_host_invoke(Platform platform) { +constexpr bool dispatcher_use_host_invoke(Platform platform) +{ return !(platform == Platform::cuda || platform == Platform::hip); } // Transforms one dispatch policy into another by creating a dispatch policy // of holder_type objects. See usage in WorkRunner for more explanation. -template < typename dispatch_policy, typename holder_type > +template struct dispatcher_transform_types; /// -template < typename dispatch_policy, typename holder_type > +template using dispatcher_transform_types_t = typename dispatcher_transform_types::type; @@ -75,12 +76,17 @@ using dispatcher_transform_types_t = * DispatcherID is used to differentiate function pointers based on their * function signature. */ -template < Platform platform, typename dispatch_policy, typename DispatcherID, typename ... CallArgs > +template struct Dispatcher; -template < typename holder_type > -struct dispatcher_transform_types<::RAJA::indirect_function_call_dispatch, holder_type> { +template +struct dispatcher_transform_types<::RAJA::indirect_function_call_dispatch, + holder_type> +{ using type = ::RAJA::indirect_function_call_dispatch; }; @@ -93,8 +99,12 @@ struct dispatcher_transform_types<::RAJA::indirect_function_call_dispatch, holde * during device linking when functions with high register counts may cause * device linking to fail. */ -template < Platform platform, typename DispatcherID, typename ... 
CallArgs > -struct Dispatcher { +template +struct Dispatcher +{ static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); using dispatch_policy = ::RAJA::indirect_function_call_dispatch; using void_ptr_wrapper = DispatcherVoidPtrWrapper; @@ -104,27 +114,29 @@ struct Dispatcher - static void s_move_construct_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) + template + static void s_move_construct_destroy(void_ptr_wrapper dest, + void_ptr_wrapper src) { T* dest_as_T = static_cast(dest.ptr); T* src_as_T = static_cast(src.ptr); - new(dest_as_T) T(std::move(*src_as_T)); + new (dest_as_T) T(std::move(*src_as_T)); (*src_as_T).~T(); } /// /// invoke the call operator of the object of type T in obj with args /// - template < typename T > + template static void s_host_invoke(void_cptr_wrapper obj, CallArgs... args) { const T* obj_as_T = static_cast(obj.ptr); (*obj_as_T)(std::forward(args)...); } /// - template < typename T > - static RAJA_DEVICE void s_device_invoke(void_cptr_wrapper obj, CallArgs... args) + template + static RAJA_DEVICE void s_device_invoke(void_cptr_wrapper obj, + CallArgs... args) { const T* obj_as_T = static_cast(obj.ptr); (*obj_as_T)(std::forward(args)...); @@ -133,22 +145,26 @@ struct Dispatcher + template static void s_destroy(void_ptr_wrapper obj) { T* obj_as_T = static_cast(obj.ptr); (*obj_as_T).~T(); } - using mover_type = void(*)(void_ptr_wrapper /*dest*/, void_ptr_wrapper /*src*/); - using invoker_type = void(*)(void_cptr_wrapper /*obj*/, CallArgs... /*args*/); - using destroyer_type = void(*)(void_ptr_wrapper /*obj*/); + using mover_type = void (*)(void_ptr_wrapper /*dest*/, + void_ptr_wrapper /*src*/); + using invoker_type = void (*)(void_cptr_wrapper /*obj*/, + CallArgs... /*args*/); + using destroyer_type = void (*)(void_ptr_wrapper /*obj*/); // This can't be a cuda device lambda due to compiler limitations - template < typename T > - struct DeviceInvokerFactory { + template + struct DeviceInvokerFactory + { using value_type = invoker_type; - RAJA_DEVICE value_type operator()() { + RAJA_DEVICE value_type operator()() + { #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) return nullptr; #else @@ -160,14 +176,15 @@ struct Dispatcher* = nullptr > - static inline Dispatcher makeDispatcher() { - return { mover_type{&s_move_construct_destroy}, - invoker_type{&s_host_invoke}, - destroyer_type{&s_destroy}, - sizeof(T) - }; + template * = nullptr> + static inline Dispatcher makeDispatcher() + { + return {mover_type{&s_move_construct_destroy}, + invoker_type{&s_host_invoke}, + destroyer_type{&s_destroy}, + sizeof(T)}; } /// /// create a Dispatcher that can be used on the device for objects of type T @@ -179,14 +196,17 @@ struct Dispatcher* = nullptr > - static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice) { - return { mover_type{&s_move_construct_destroy}, - invoker_type{std::forward(createOnDevice)(DeviceInvokerFactory{})}, - destroyer_type{&s_destroy}, - sizeof(T) - }; + template * = nullptr> + static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice) + { + return {mover_type{&s_move_construct_destroy}, + invoker_type{std::forward(createOnDevice)( + DeviceInvokerFactory{})}, + destroyer_type{&s_destroy}, + sizeof(T)}; } mover_type move_construct_destroy; @@ -196,8 +216,10 @@ struct Dispatcher -struct dispatcher_transform_types<::RAJA::indirect_virtual_function_dispatch, holder_type> { +template +struct dispatcher_transform_types<::RAJA::indirect_virtual_function_dispatch, + 
holder_type> +{ using type = ::RAJA::indirect_virtual_function_dispatch; }; @@ -210,38 +232,48 @@ struct dispatcher_transform_types<::RAJA::indirect_virtual_function_dispatch, ho * during device linking when functions with high register counts may cause * device linking to fail. */ -template < Platform platform, typename DispatcherID, typename ... CallArgs > -struct Dispatcher { +template +struct Dispatcher +{ static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); using dispatch_policy = ::RAJA::indirect_virtual_function_dispatch; using void_ptr_wrapper = DispatcherVoidPtrWrapper; using void_cptr_wrapper = DispatcherVoidConstPtrWrapper; - struct impl_base { - virtual void move_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) const = 0; + struct impl_base + { + virtual void move_destroy(void_ptr_wrapper dest, + void_ptr_wrapper src) const = 0; virtual void destroy(void_ptr_wrapper obj) const = 0; }; - struct host_impl_base { + struct host_impl_base + { virtual void invoke(void_cptr_wrapper obj, CallArgs... args) const = 0; }; - struct device_impl_base { - virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj, CallArgs... args) const = 0; + struct device_impl_base + { + virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj, + CallArgs... args) const = 0; }; - template < typename T > + template struct base_impl_type : impl_base { /// /// move construct an object of type T in dest as a copy of a T from src and /// destroy the T obj in src /// - virtual void move_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) const override + virtual void move_destroy(void_ptr_wrapper dest, + void_ptr_wrapper src) const override { T* dest_as_T = static_cast(dest.ptr); T* src_as_T = static_cast(src.ptr); - new(dest_as_T) T(std::move(*src_as_T)); + new (dest_as_T) T(std::move(*src_as_T)); (*src_as_T).~T(); } @@ -255,7 +287,7 @@ struct Dispatcher + template struct host_impl_type : host_impl_base { /// @@ -268,20 +300,22 @@ struct Dispatcher + template struct device_impl_type : device_impl_base { /// /// invoke the call operator of the object of type T in obj with args /// - virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj, CallArgs... args) const override + virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj, + CallArgs... 
args) const override { const T* obj_as_T = static_cast(obj.ptr); (*obj_as_T)(std::forward(args)...); } }; - struct mover_type { + struct mover_type + { impl_base* m_impl; void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const { @@ -289,7 +323,8 @@ struct Dispatcherinvoke(obj, std::forward(args)...); } }; - using invoker_type = std::conditional_t; + using invoker_type = std:: + conditional_t; - struct destroyer_type { + struct destroyer_type + { impl_base* m_impl; - void operator()(void_ptr_wrapper obj) const - { - m_impl->destroy(obj); - } + void operator()(void_ptr_wrapper obj) const { m_impl->destroy(obj); } }; // This can't be a cuda device lambda due to compiler limitations - template < typename T > - struct DeviceImplTypeFactory { + template + struct DeviceImplTypeFactory + { using value_type = device_impl_type*; - RAJA_DEVICE value_type operator()() { + RAJA_DEVICE value_type operator()() + { #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) return nullptr; #else @@ -333,16 +368,17 @@ struct Dispatcher* = nullptr > - static inline Dispatcher makeDispatcher() { + template * = nullptr> + static inline Dispatcher makeDispatcher() + { static base_impl_type s_base_impl; static host_impl_type s_host_impl; - return { mover_type{&s_base_impl}, - host_invoker_type{&s_host_impl}, - destroyer_type{&s_base_impl}, - sizeof(T) - }; + return {mover_type{&s_base_impl}, + host_invoker_type{&s_host_impl}, + destroyer_type{&s_base_impl}, + sizeof(T)}; } /// /// create a Dispatcher that can be used on the device for objects of type T @@ -354,17 +390,19 @@ struct Dispatcher* = nullptr> - static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice) { + template * = nullptr> + static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice) + { static base_impl_type s_base_impl; - static device_impl_type* s_device_impl_ptr{ - std::forward(createOnDevice)(DeviceImplTypeFactory{}) }; - return { mover_type{&s_base_impl}, - device_invoker_type{s_device_impl_ptr}, - destroyer_type{&s_base_impl}, - sizeof(T) - }; + static device_impl_type* s_device_impl_ptr{std::forward( + createOnDevice)(DeviceImplTypeFactory{})}; + return {mover_type{&s_base_impl}, + device_invoker_type{s_device_impl_ptr}, + destroyer_type{&s_base_impl}, + sizeof(T)}; } mover_type move_construct_destroy; @@ -375,17 +413,23 @@ struct Dispatcher -struct dispatcher_transform_types<::RAJA::direct_dispatch, holder_type> { - using type = ::RAJA::direct_dispatch...>; +template +struct dispatcher_transform_types<::RAJA::direct_dispatch, holder_type> +{ + using type = + ::RAJA::direct_dispatch...>; }; /*! * Version of Dispatcher that does direct dispatch to zero callable types. * It implements the interface with callable objects. */ -template < Platform platform, typename DispatcherID, typename ... CallArgs > -struct Dispatcher, DispatcherID, CallArgs...> { +template +struct Dispatcher, + DispatcherID, + CallArgs...> +{ static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); using dispatch_policy = ::RAJA::direct_dispatch<>; using void_ptr_wrapper = DispatcherVoidPtrWrapper; @@ -395,40 +439,41 @@ struct Dispatcher, DispatcherID, CallArgs... 
/// move construct an object of type T in dest as a copy of a T from src and /// destroy the T obj in src /// - struct mover_type { - void operator()(void_ptr_wrapper, void_ptr_wrapper) const - { } + struct mover_type + { + void operator()(void_ptr_wrapper, void_ptr_wrapper) const {} }; /// /// invoke the call operator of the object of type T in obj with args /// - struct host_invoker_type { - void operator()(void_cptr_wrapper, CallArgs...) const - { } + struct host_invoker_type + { + void operator()(void_cptr_wrapper, CallArgs...) const {} }; - struct device_invoker_type { - RAJA_DEVICE void operator()(void_cptr_wrapper, CallArgs...) const - { } + struct device_invoker_type + { + RAJA_DEVICE void operator()(void_cptr_wrapper, CallArgs...) const {} }; - using invoker_type = std::conditional_t; + using invoker_type = std:: + conditional_t; /// /// destroy the object of type T in obj /// - struct destroyer_type { - void operator()(void_ptr_wrapper) const - { } + struct destroyer_type + { + void operator()(void_ptr_wrapper) const {} }; /// /// create a Dispatcher that can be used on the host for objects of type T /// - template< typename T, - bool uhi = use_host_invoke, std::enable_if_t* = nullptr > - static inline Dispatcher makeDispatcher() { + template * = nullptr> + static inline Dispatcher makeDispatcher() + { return {mover_type{}, host_invoker_type{}, destroyer_type{}, sizeof(T)}; } /// @@ -437,9 +482,12 @@ struct Dispatcher, DispatcherID, CallArgs... /// Ignore the CreateOnDevice object as the same invoker object can be used /// on the host and device. /// - template< typename T, typename CreateOnDevice, - bool uhi = use_host_invoke, std::enable_if_t* = nullptr > - static inline Dispatcher makeDispatcher(CreateOnDevice&&) { + template * = nullptr> + static inline Dispatcher makeDispatcher(CreateOnDevice&&) + { return {mover_type{}, device_invoker_type{}, destroyer_type{}, sizeof(T)}; } @@ -453,8 +501,15 @@ struct Dispatcher, DispatcherID, CallArgs... * Version of Dispatcher that does direct dispatch to a single callable type. * It implements the interface with callable objects. */ -template < Platform platform, typename T, typename DispatcherID, typename ... CallArgs > -struct Dispatcher, DispatcherID, CallArgs...> { +template +struct Dispatcher, + DispatcherID, + CallArgs...> +{ static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); using dispatch_policy = ::RAJA::direct_dispatch; using void_ptr_wrapper = DispatcherVoidPtrWrapper; @@ -464,12 +519,13 @@ struct Dispatcher, DispatcherID, CallArgs.. /// move construct an object of type T in dest as a copy of a T from src and /// destroy the T obj in src /// - struct mover_type { + struct mover_type + { void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const { T* dest_as_T = static_cast(dest.ptr); T* src_as_T = static_cast(src.ptr); - new(dest_as_T) T(std::move(*src_as_T)); + new (dest_as_T) T(std::move(*src_as_T)); (*src_as_T).~T(); } }; @@ -477,28 +533,30 @@ struct Dispatcher, DispatcherID, CallArgs.. /// /// invoke the call operator of the object of type T in obj with args /// - struct host_invoker_type { + struct host_invoker_type + { void operator()(void_cptr_wrapper obj, CallArgs... args) const { const T* obj_as_T = static_cast(obj.ptr); (*obj_as_T)(std::forward(args)...); } }; - struct device_invoker_type { + struct device_invoker_type + { RAJA_DEVICE void operator()(void_cptr_wrapper obj, CallArgs... 
args) const { const T* obj_as_T = static_cast(obj.ptr); (*obj_as_T)(std::forward(args)...); } }; - using invoker_type = std::conditional_t; + using invoker_type = std:: + conditional_t; /// /// destroy the object of type T in obj /// - struct destroyer_type { + struct destroyer_type + { void operator()(void_ptr_wrapper obj) const { T* obj_as_T = static_cast(obj.ptr); @@ -509,10 +567,13 @@ struct Dispatcher, DispatcherID, CallArgs.. /// /// create a Dispatcher that can be used on the host for objects of type T /// - template< typename U, - bool uhi = use_host_invoke, std::enable_if_t* = nullptr > - static inline Dispatcher makeDispatcher() { - static_assert(std::is_same::value, "U must be in direct_dispatch types"); + template * = nullptr> + static inline Dispatcher makeDispatcher() + { + static_assert(std::is_same::value, + "U must be in direct_dispatch types"); return {mover_type{}, host_invoker_type{}, destroyer_type{}, sizeof(T)}; } /// @@ -521,10 +582,14 @@ struct Dispatcher, DispatcherID, CallArgs.. /// Ignore the CreateOnDevice object as the same invoker object can be used /// on the host and device. /// - template< typename U, typename CreateOnDevice, - bool uhi = use_host_invoke, std::enable_if_t* = nullptr > - static inline Dispatcher makeDispatcher(CreateOnDevice&&) { - static_assert(std::is_same::value, "U must be in direct_dispatch types"); + template * = nullptr> + static inline Dispatcher makeDispatcher(CreateOnDevice&&) + { + static_assert(std::is_same::value, + "U must be in direct_dispatch types"); return {mover_type{}, device_invoker_type{}, destroyer_type{}, sizeof(T)}; } @@ -538,46 +603,55 @@ struct Dispatcher, DispatcherID, CallArgs.. * Version of Dispatcher that does direct dispatch to multiple callable types. * It implements the interface with callable objects. */ -template < typename T0, typename T1, typename ... TNs, - Platform platform, typename DispatcherID, typename ... CallArgs > -struct Dispatcher, - DispatcherID, CallArgs...> { +template +struct Dispatcher, + DispatcherID, + CallArgs...> +{ static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); using dispatch_policy = ::RAJA::direct_dispatch; using void_ptr_wrapper = DispatcherVoidPtrWrapper; using void_cptr_wrapper = DispatcherVoidConstPtrWrapper; using id_type = int; - using callable_indices = camp::make_int_seq_t; + using callable_indices = camp::make_int_seq_t; using callable_types = camp::list; /// /// move construct an object of type T in dest as a copy of a T from src and /// destroy the T obj in src /// - struct mover_type { + struct mover_type + { id_type id; void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const { - impl_helper(callable_indices{}, callable_types{}, - dest, src); + impl_helper(callable_indices{}, callable_types{}, dest, src); } private: - template < int ... id_types, typename ... Ts > - void impl_helper(camp::int_seq, camp::list, - void_ptr_wrapper dest, void_ptr_wrapper src) const + template + void impl_helper(camp::int_seq, + camp::list, + void_ptr_wrapper dest, + void_ptr_wrapper src) const { camp::sink(((id_types == id) ? 
(impl(dest, src), 0) : 0)...); } - template < typename T > + template void impl(void_ptr_wrapper dest, void_ptr_wrapper src) const { T* dest_as_T = static_cast(dest.ptr); T* src_as_T = static_cast(src.ptr); - new(dest_as_T) T(std::move(*src_as_T)); + new (dest_as_T) T(std::move(*src_as_T)); (*src_as_T).~T(); } }; @@ -585,79 +659,93 @@ struct Dispatcher, /// /// invoke the call operator of the object of type T in obj with args /// - struct host_invoker_type { + struct host_invoker_type + { id_type id; void operator()(void_cptr_wrapper obj, CallArgs... args) const { - impl_helper(callable_indices{}, callable_types{}, - obj, std::forward(args)...); + impl_helper(callable_indices{}, + callable_types{}, + obj, + std::forward(args)...); } private: - template < int ... id_types, typename ... Ts > - void impl_helper(camp::int_seq, camp::list, - void_cptr_wrapper obj, CallArgs... args) const + template + void impl_helper(camp::int_seq, + camp::list, + void_cptr_wrapper obj, + CallArgs... args) const { - camp::sink(((id_types == id) ? (impl(obj, std::forward(args)...), 0) : 0)...); + camp::sink(((id_types == id) + ? (impl(obj, std::forward(args)...), 0) + : 0)...); } - template < typename T > + template void impl(void_cptr_wrapper obj, CallArgs... args) const { const T* obj_as_T = static_cast(obj.ptr); (*obj_as_T)(std::forward(args)...); } }; - struct device_invoker_type { + struct device_invoker_type + { id_type id; RAJA_DEVICE void operator()(void_cptr_wrapper obj, CallArgs... args) const { - impl_helper(callable_indices{}, callable_types{}, - obj, std::forward(args)...); + impl_helper(callable_indices{}, + callable_types{}, + obj, + std::forward(args)...); } private: - template < int ... id_types, typename ... Ts > - RAJA_DEVICE void impl_helper(camp::int_seq, camp::list, - void_cptr_wrapper obj, CallArgs... args) const + template + RAJA_DEVICE void impl_helper(camp::int_seq, + camp::list, + void_cptr_wrapper obj, + CallArgs... args) const { - camp::sink(((id_types == id) ? (impl(obj, std::forward(args)...), 0) : 0)...); + camp::sink(((id_types == id) + ? (impl(obj, std::forward(args)...), 0) + : 0)...); } - template < typename T > + template RAJA_DEVICE void impl(void_cptr_wrapper obj, CallArgs... args) const { const T* obj_as_T = static_cast(obj.ptr); (*obj_as_T)(std::forward(args)...); } }; - using invoker_type = std::conditional_t; + using invoker_type = std:: + conditional_t; /// /// destroy the object of type T in obj /// - struct destroyer_type { + struct destroyer_type + { id_type id; void operator()(void_ptr_wrapper obj) const { - impl_helper(callable_indices{}, callable_types{}, - obj); + impl_helper(callable_indices{}, callable_types{}, obj); } private: - template < int ... id_types, typename ... Ts > - void impl_helper(camp::int_seq, camp::list, - void_ptr_wrapper obj) const + template + void impl_helper(camp::int_seq, + camp::list, + void_ptr_wrapper obj) const { camp::sink(((id_types == id) ? (impl(obj), 0) : 0)...); } - template < typename T > + template void impl(void_ptr_wrapper obj) const { T* obj_as_T = static_cast(obj.ptr); @@ -671,12 +759,13 @@ struct Dispatcher, /// The id is just the index of T in the list of callable_types. /// If T is not in Ts return -1. /// - template < typename T, int ... id_types, typename ... 
Ts > - static constexpr id_type get_id(camp::int_seq, camp::list) + template + static constexpr id_type get_id(camp::int_seq, + camp::list) { id_type id{-1}; // quiet UB warning by sequencing assignment to id with list initialization - int unused[] {0, (std::is_same::value ? ((id = id_types), 0) : 0)...}; + int unused[]{0, (std::is_same::value ? ((id = id_types), 0) : 0)...}; camp::sink(unused); // quiet unused var warning return id; } @@ -684,12 +773,16 @@ struct Dispatcher, /// /// create a Dispatcher that can be used on the host for objects of type T /// - template< typename T, - bool uhi = use_host_invoke, std::enable_if_t* = nullptr > - static inline Dispatcher makeDispatcher() { - static constexpr id_type id = get_id(callable_indices{}, callable_types{}); + template * = nullptr> + static inline Dispatcher makeDispatcher() + { + static constexpr id_type id = + get_id(callable_indices{}, callable_types{}); static_assert(id != id_type(-1), "T must be in direct_dispatch types"); - return {mover_type{id}, host_invoker_type{id}, destroyer_type{id}, sizeof(T)}; + return { + mover_type{id}, host_invoker_type{id}, destroyer_type{id}, sizeof(T)}; } /// /// create a Dispatcher that can be used on the device for objects of type T @@ -697,12 +790,17 @@ struct Dispatcher, /// Ignore the CreateOnDevice object as the same invoker object can be used /// on the host and device. /// - template< typename T, typename CreateOnDevice, - bool uhi = use_host_invoke, std::enable_if_t* = nullptr > - static inline Dispatcher makeDispatcher(CreateOnDevice&&) { - static constexpr id_type id = get_id(callable_indices{}, callable_types{}); + template * = nullptr> + static inline Dispatcher makeDispatcher(CreateOnDevice&&) + { + static constexpr id_type id = + get_id(callable_indices{}, callable_types{}); static_assert(id != id_type(-1), "T must be in direct_dispatch types"); - return {mover_type{id}, device_invoker_type{id}, destroyer_type{id}, sizeof(T)}; + return { + mover_type{id}, device_invoker_type{id}, destroyer_type{id}, sizeof(T)}; } mover_type move_construct_destroy; @@ -718,8 +816,8 @@ struct Dispatcher, // template < typename T, typename Dispatcher_T > // inline const Dispatcher_T* get_Dispatcher(work_policy const&); -} // namespace detail +} // namespace detail -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/WorkGroup/WorkRunner.hpp b/include/RAJA/pattern/WorkGroup/WorkRunner.hpp index 9645f73050..e07b64cdb2 100644 --- a/include/RAJA/pattern/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/pattern/WorkGroup/WorkRunner.hpp @@ -40,18 +40,18 @@ namespace detail /*! * A body and args holder for storing loops that are being executed in foralls */ -template +template struct HoldBodyArgs_base { // NOTE: This constructor is disabled when body_in is not LoopBody // to avoid it conflicting with the copy and move constructors - template < typename body_in, - typename = typename std::enable_if< - std::is_same>::value>::type > + template >::value>::type> HoldBodyArgs_base(body_in&& body, Args... args) - : m_body(std::forward(body)) - , m_arg_tuple(std::forward(args)...) - { } + : m_body(std::forward(body)), + m_arg_tuple(std::forward(args)...) 
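The Dispatcher.hpp hunks above reformat three specializations that erase the user's callable type in different ways: per-type static function pointers (indirect_function_call_dispatch), a per-type vtable behind abstract bases (indirect_virtual_function_dispatch), and a compile-time list of allowed types selected by an integer id (direct_dispatch). A self-contained sketch of the three strategies, not RAJA code, for a single void(int) call signature:

    // 1) indirect function call: one static function per erased callable T,
    //    stored as a plain function pointer.
    template <typename T>
    void invoke_fnptr(const void* obj, int i)
    {
      (*static_cast<const T*>(obj))(i);
    }
    using invoker_fnptr = void (*)(const void*, int);

    // 2) indirect virtual function: a per-type override reached through a vtable.
    struct invoker_base
    {
      virtual ~invoker_base() = default;
      virtual void invoke(const void* obj, int i) const = 0;
    };
    template <typename T>
    struct invoker_impl : invoker_base
    {
      void invoke(const void* obj, int i) const override
      {
        (*static_cast<const T*>(obj))(i);
      }
    };

    // 3) direct dispatch: the set of callable types is fixed at compile time
    //    and a stored id selects among them, so the call is a branch rather
    //    than an indirect jump.
    template <typename T0, typename T1>
    struct invoker_direct
    {
      int id;  // 0 selects T0, 1 selects T1
      void operator()(const void* obj, int i) const
      {
        if (id == 0) (*static_cast<const T0*>(obj))(i);
        else         (*static_cast<const T1*>(obj))(i);
      }
    };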
+ {} protected: LoopBody m_body; @@ -62,7 +62,7 @@ struct HoldBodyArgs_base * A body and args holder for storing loops that are being executed in foralls * that run on the host */ -template +template struct HoldBodyArgs_host : HoldBodyArgs_base { using base = HoldBodyArgs_base; @@ -73,7 +73,7 @@ struct HoldBodyArgs_host : HoldBodyArgs_base invoke(i, camp::make_idx_seq_t{}); } - template < camp::idx_t ... Is > + template RAJA_INLINE void invoke(index_type i, camp::idx_seq) const { this->m_body(i, get(this->m_arg_tuple)...); @@ -84,7 +84,7 @@ struct HoldBodyArgs_host : HoldBodyArgs_base * A body and args holder for storing loops that are being executed in foralls * that run on the device */ -template +template struct HoldBodyArgs_device : HoldBodyArgs_base { using base = HoldBodyArgs_base; @@ -95,7 +95,7 @@ struct HoldBodyArgs_device : HoldBodyArgs_base invoke(i, camp::make_idx_seq_t{}); } - template < camp::idx_t ... Is > + template RAJA_DEVICE RAJA_INLINE void invoke(index_type i, camp::idx_seq) const { this->m_body(i, get(this->m_arg_tuple)...); @@ -105,21 +105,24 @@ struct HoldBodyArgs_device : HoldBodyArgs_base /*! * A body and segment holder for storing loops that will be executed as foralls */ -template +template struct HoldForall { using resource_type = typename resources::get_resource::type; using HoldBodyArgs = typename std::conditional< !type_traits::is_device_exec_policy::value, HoldBodyArgs_host, - HoldBodyArgs_device >::type; + HoldBodyArgs_device>::type; - template < typename segment_in, typename body_in > + template HoldForall(segment_in&& segment, body_in&& body) - : m_segment(std::forward(segment)) - , m_body(std::forward(body)) - { } + : m_segment(std::forward(segment)), + m_body(std::forward(body)) + {} RAJA_INLINE void operator()(resource_type r, Args... args) const { @@ -143,7 +146,7 @@ template + typename... Args> struct WorkRunner; @@ -156,7 +159,7 @@ template + typename... Args> struct WorkRunnerForallOrdered_base { using exec_policy = EXEC_POLICY_T; @@ -164,20 +167,24 @@ struct WorkRunnerForallOrdered_base using dispatch_policy = DISPATCH_POLICY_T; using Allocator = ALLOCATOR_T; using index_type = INDEX_T; - using resource_type = typename resources::get_resource::type; + using resource_type = + typename resources::get_resource::type; using forall_exec_policy = FORALL_EXEC_POLICY; // The type that will hold the segment and loop body in work storage - struct holder_type { - template < typename T > - using type = HoldForall>::type, // segment_type - typename camp::at>::type, // loop_type - index_type, Args...>; + struct holder_type + { + template + using type = + HoldForall>::type, // segment_type + typename camp::at>::type, // loop_type + index_type, + Args...>; }; /// - template < typename T > + template using holder_type_t = typename holder_type::template type; // The policy indicating where the call function is invoked @@ -186,33 +193,41 @@ struct WorkRunnerForallOrdered_base // The Dispatcher policy with holder_types used internally to handle the // ranges and callables passed in by the user. 
- using dispatcher_holder_policy = dispatcher_transform_types_t; + using dispatcher_holder_policy = + dispatcher_transform_types_t; - using dispatcher_type = Dispatcher; + using dispatcher_type = Dispatcher; WorkRunnerForallOrdered_base() = default; WorkRunnerForallOrdered_base(WorkRunnerForallOrdered_base const&) = delete; - WorkRunnerForallOrdered_base& operator=(WorkRunnerForallOrdered_base const&) = delete; + WorkRunnerForallOrdered_base& + operator=(WorkRunnerForallOrdered_base const&) = delete; - WorkRunnerForallOrdered_base(WorkRunnerForallOrdered_base &&) = default; - WorkRunnerForallOrdered_base& operator=(WorkRunnerForallOrdered_base &&) = default; + WorkRunnerForallOrdered_base(WorkRunnerForallOrdered_base&&) = default; + WorkRunnerForallOrdered_base& + operator=(WorkRunnerForallOrdered_base&&) = default; // runner interfaces with storage to enqueue so the runner can get // information from the segment and loop at enqueue time - template < typename WorkContainer, typename segment_T, typename loop_T > + template inline void enqueue(WorkContainer& storage, segment_T&& seg, loop_T&& loop) { - using holder = holder_type_t, camp::decay>>; + using holder = + holder_type_t, camp::decay>>; storage.template emplace( get_Dispatcher(dispatcher_exec_policy{}), - std::forward(seg), std::forward(loop)); + std::forward(seg), + std::forward(loop)); } // clear any state so ready to be destroyed or reused - void clear() - { } + void clear() {} // no extra storage required here using per_run_storage = int; @@ -227,29 +242,27 @@ template + typename... Args> struct WorkRunnerForallOrdered - : WorkRunnerForallOrdered_base< - FORALL_EXEC_POLICY, - EXEC_POLICY_T, - ORDER_POLICY_T, - DISPATCH_POLICY_T, - ALLOCATOR_T, - INDEX_T, - Args...> + : WorkRunnerForallOrdered_base { - using base = WorkRunnerForallOrdered_base< - FORALL_EXEC_POLICY, - EXEC_POLICY_T, - ORDER_POLICY_T, - DISPATCH_POLICY_T, - ALLOCATOR_T, - INDEX_T, - Args...>; + using base = WorkRunnerForallOrdered_base; using base::base; // run the loops using forall in the order that they were enqueued - template < typename WorkContainer > + template typename base::per_run_storage run(WorkContainer const& storage, typename base::resource_type r, Args... args) const @@ -259,7 +272,8 @@ struct WorkRunnerForallOrdered typename base::per_run_storage run_storage{}; auto end = storage.end(); - for (auto iter = storage.begin(); iter != end; ++iter) { + for (auto iter = storage.begin(); iter != end; ++iter) + { value_type::host_call(&*iter, r, args...); } @@ -276,29 +290,28 @@ template + typename... Args> struct WorkRunnerForallReverse - : WorkRunnerForallOrdered_base< - FORALL_EXEC_POLICY, - EXEC_POLICY_T, - ORDER_POLICY_T, - DISPATCH_POLICY_T, - ALLOCATOR_T, - INDEX_T, - Args...> + : WorkRunnerForallOrdered_base { - using base = WorkRunnerForallOrdered_base< - FORALL_EXEC_POLICY, - EXEC_POLICY_T, - ORDER_POLICY_T, - DISPATCH_POLICY_T, - ALLOCATOR_T, - INDEX_T, - Args...>; + using base = WorkRunnerForallOrdered_base; using base::base; - // run the loops using forall in the reverse order to the order they were enqueued - template < typename WorkContainer > + // run the loops using forall in the reverse order to the order they were + // enqueued + template typename base::per_run_storage run(WorkContainer const& storage, typename base::resource_type r, Args... 
args) const @@ -308,16 +321,17 @@ struct WorkRunnerForallReverse typename base::per_run_storage run_storage{}; auto begin = storage.begin(); - for (auto iter = storage.end(); iter != begin; --iter) { - value_type::host_call(&*(iter-1), r, args...); + for (auto iter = storage.end(); iter != begin; --iter) + { + value_type::host_call(&*(iter - 1), r, args...); } return run_storage; } }; -} // namespace detail +} // namespace detail -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/WorkGroup/WorkStorage.hpp b/include/RAJA/pattern/WorkGroup/WorkStorage.hpp index 52631d108f..8a43982bd3 100644 --- a/include/RAJA/pattern/WorkGroup/WorkStorage.hpp +++ b/include/RAJA/pattern/WorkGroup/WorkStorage.hpp @@ -46,7 +46,7 @@ namespace detail // operator - ( iterator_base const& ) // operator == ( iterator_base const& ) // operator < ( iterator_base const& ) -template < typename iterator_base > +template struct random_access_iterator : iterator_base { using base = iterator_base; @@ -59,10 +59,10 @@ struct random_access_iterator : iterator_base using base::base; random_access_iterator(random_access_iterator const&) = default; - random_access_iterator(random_access_iterator &&) = default; + random_access_iterator(random_access_iterator&&) = default; random_access_iterator& operator=(random_access_iterator const&) = default; - random_access_iterator& operator=(random_access_iterator &&) = default; + random_access_iterator& operator=(random_access_iterator&&) = default; RAJA_HOST_DEVICE reference operator*() const @@ -70,10 +70,7 @@ struct random_access_iterator : iterator_base return *static_cast(*this); } - RAJA_HOST_DEVICE pointer operator->() const - { - return &(*(*this)); - } + RAJA_HOST_DEVICE pointer operator->() const { return &(*(*this)); } RAJA_HOST_DEVICE reference operator[](difference_type i) const { @@ -120,68 +117,75 @@ struct random_access_iterator : iterator_base return *this; } - RAJA_HOST_DEVICE friend inline random_access_iterator operator+( - random_access_iterator const& lhs, difference_type rhs) + RAJA_HOST_DEVICE friend inline random_access_iterator + operator+(random_access_iterator const& lhs, difference_type rhs) { random_access_iterator copy = lhs; copy += rhs; return copy; } - RAJA_HOST_DEVICE friend inline random_access_iterator operator+( - difference_type lhs, random_access_iterator const& rhs) + RAJA_HOST_DEVICE friend inline random_access_iterator + operator+(difference_type lhs, random_access_iterator const& rhs) { random_access_iterator copy = rhs; copy += lhs; return copy; } - RAJA_HOST_DEVICE friend inline random_access_iterator operator-( - random_access_iterator const& lhs, difference_type rhs) + RAJA_HOST_DEVICE friend inline random_access_iterator + operator-(random_access_iterator const& lhs, difference_type rhs) { random_access_iterator copy = lhs; copy -= rhs; return copy; } - RAJA_HOST_DEVICE friend inline difference_type operator-( - random_access_iterator const& lhs, random_access_iterator const& rhs) + RAJA_HOST_DEVICE friend inline difference_type + operator-(random_access_iterator const& lhs, + random_access_iterator const& rhs) { return static_cast(lhs) - static_cast(rhs); } - RAJA_HOST_DEVICE friend inline bool operator==( - random_access_iterator const& lhs, random_access_iterator const& rhs) + RAJA_HOST_DEVICE friend inline bool + operator==(random_access_iterator const& lhs, + random_access_iterator const& rhs) { return 
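The WorkRunner.hpp hunks above end with the two host runners: WorkRunnerForallOrdered replays the stored WorkStructs front to back, while WorkRunnerForallReverse replays them back to front, invoking host_call on each entry. A self-contained sketch of that replay order (not RAJA code; std::function stands in for the stored work items):

    #include <functional>
    #include <vector>

    // run enqueued work in the order it was added
    void run_ordered(const std::vector<std::function<void()>>& work)
    {
      for (auto it = work.begin(); it != work.end(); ++it)
      {
        (*it)();
      }
    }

    // run enqueued work in reverse order, mirroring the (iter - 1) loop above
    void run_reverse(const std::vector<std::function<void()>>& work)
    {
      for (auto it = work.end(); it != work.begin(); --it)
      {
        (*(it - 1))();
      }
    }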
static_cast(lhs) == static_cast(rhs); } - RAJA_HOST_DEVICE friend inline bool operator!=( - random_access_iterator const& lhs, random_access_iterator const& rhs) + RAJA_HOST_DEVICE friend inline bool + operator!=(random_access_iterator const& lhs, + random_access_iterator const& rhs) { return !(lhs == rhs); } - RAJA_HOST_DEVICE friend inline bool operator<( - random_access_iterator const& lhs, random_access_iterator const& rhs) + RAJA_HOST_DEVICE friend inline bool + operator<(random_access_iterator const& lhs, + random_access_iterator const& rhs) { return static_cast(lhs) < static_cast(rhs); } - RAJA_HOST_DEVICE friend inline bool operator<=( - random_access_iterator const& lhs, random_access_iterator const& rhs) + RAJA_HOST_DEVICE friend inline bool + operator<=(random_access_iterator const& lhs, + random_access_iterator const& rhs) { return !(rhs < lhs); } - RAJA_HOST_DEVICE friend inline bool operator>( - random_access_iterator const& lhs, random_access_iterator const& rhs) + RAJA_HOST_DEVICE friend inline bool + operator>(random_access_iterator const& lhs, + random_access_iterator const& rhs) { return rhs < lhs; } - RAJA_HOST_DEVICE friend inline bool operator>=( - random_access_iterator const& lhs, random_access_iterator const& rhs) + RAJA_HOST_DEVICE friend inline bool + operator>=(random_access_iterator const& lhs, + random_access_iterator const& rhs) { return !(lhs < rhs); } @@ -191,10 +195,12 @@ struct random_access_iterator : iterator_base /*! * A storage container for work groups */ -template < typename STORAGE_POLICY_T, typename ALLOCATOR_T, typename Dispatcher_T > +template class WorkStorage; -template < typename ALLOCATOR_T, typename Dispatcher_T > +template class WorkStorage { using allocator_traits_type = std::allocator_traits; @@ -202,15 +208,17 @@ class WorkStorage typename allocator_traits_type::propagate_on_container_copy_assignment; using propagate_on_container_move_assignment = typename allocator_traits_type::propagate_on_container_move_assignment; - using propagate_on_container_swap = + using propagate_on_container_swap = typename allocator_traits_type::propagate_on_container_swap; - static_assert(std::is_same::value, + static_assert( + std::is_same::value, "WorkStorage expects an allocator for 'char's."); + public: using storage_policy = RAJA::array_of_pointers; using dispatcher_type = Dispatcher_T; - template < typename holder > + template using true_value_type = WorkStruct; using value_type = GenericWorkStruct; @@ -231,8 +239,8 @@ class WorkStorage }; public: - - // iterator base class for accessing stored WorkStructs outside of the container + // iterator base class for accessing stored WorkStructs outside of the + // container struct const_iterator_base { using value_type = const typename WorkStorage::value_type; @@ -241,14 +249,9 @@ class WorkStorage using difference_type = typename WorkStorage::difference_type; using iterator_category = std::random_access_iterator_tag; - const_iterator_base(const pointer_and_size* ptrptr) - : m_ptrptr(ptrptr) - { } + const_iterator_base(const pointer_and_size* ptrptr) : m_ptrptr(ptrptr) {} - RAJA_HOST_DEVICE reference operator*() const - { - return *(m_ptrptr->ptr); - } + RAJA_HOST_DEVICE reference operator*() const { return *(m_ptrptr->ptr); } RAJA_HOST_DEVICE const_iterator_base& operator+=(difference_type n) { @@ -256,20 +259,23 @@ class WorkStorage return *this; } - RAJA_HOST_DEVICE friend inline difference_type operator-( - const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter) + RAJA_HOST_DEVICE 
friend inline difference_type + operator-(const_iterator_base const& lhs_iter, + const_iterator_base const& rhs_iter) { return lhs_iter.m_ptrptr - rhs_iter.m_ptrptr; } - RAJA_HOST_DEVICE friend inline bool operator==( - const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter) + RAJA_HOST_DEVICE friend inline bool + operator==(const_iterator_base const& lhs_iter, + const_iterator_base const& rhs_iter) { return lhs_iter.m_ptrptr == rhs_iter.m_ptrptr; } - RAJA_HOST_DEVICE friend inline bool operator<( - const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter) + RAJA_HOST_DEVICE friend inline bool + operator<(const_iterator_base const& lhs_iter, + const_iterator_base const& rhs_iter) { return lhs_iter.m_ptrptr < rhs_iter.m_ptrptr; } @@ -282,22 +288,22 @@ class WorkStorage explicit WorkStorage(allocator_type const& aloc) - : m_vec(0, aloc) - , m_aloc(aloc) - { } + : m_vec(0, aloc), m_aloc(aloc) + {} WorkStorage(WorkStorage const&) = delete; WorkStorage& operator=(WorkStorage const&) = delete; WorkStorage(WorkStorage&& rhs) - : m_vec(std::move(rhs.m_vec)) - , m_aloc(std::move(rhs.m_aloc)) - { } + : m_vec(std::move(rhs.m_vec)), m_aloc(std::move(rhs.m_aloc)) + {} WorkStorage& operator=(WorkStorage&& rhs) { - if (this != &rhs) { - move_assign_private(std::move(rhs), propagate_on_container_move_assignment{}); + if (this != &rhs) + { + move_assign_private(std::move(rhs), + propagate_on_container_move_assignment{}); } return *this; } @@ -312,33 +318,26 @@ class WorkStorage } // number of loops stored - size_type size() const - { - return m_vec.size(); - } + size_type size() const { return m_vec.size(); } - const_iterator begin() const - { - return const_iterator(m_vec.begin()); - } + const_iterator begin() const { return const_iterator(m_vec.begin()); } - const_iterator end() const - { - return const_iterator(m_vec.end()); - } + const_iterator end() const { return const_iterator(m_vec.end()); } // number of bytes used for storage of loops size_type storage_size() const { size_type storage_size_nbytes = 0; - for (size_t i = 0; i < m_vec.size(); ++i) { + for (size_t i = 0; i < m_vec.size(); ++i) + { storage_size_nbytes += m_vec[i].size; } return storage_size_nbytes; } - template < typename holder, typename ... holder_ctor_args > - void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) + template + void emplace(const dispatcher_type* dispatcher, + holder_ctor_args&&... 
ctor_args) { m_vec.emplace_back(create_value( dispatcher, std::forward(ctor_args)...)); @@ -347,20 +346,21 @@ class WorkStorage // destroy all stored loops, deallocates all storage void clear() { - while (!m_vec.empty()) { + while (!m_vec.empty()) + { destroy_value(m_vec.back()); m_vec.pop_back(); } m_vec.shrink_to_fit(); } - ~WorkStorage() - { - clear(); - } + ~WorkStorage() { clear(); } private: - RAJAVec> m_vec; + RAJAVec< + pointer_and_size, + typename allocator_traits_type::template rebind_alloc> + m_vec; allocator_type m_aloc; // move assignment if allocator propagates on move assignment @@ -375,12 +375,16 @@ class WorkStorage void move_assign_private(WorkStorage&& rhs, std::false_type) { clear(); - if (m_aloc == rhs.m_aloc) { + if (m_aloc == rhs.m_aloc) + { // take storage if allocators compare equal m_vec = std::move(rhs.m_vec); - } else { + } + else + { // allocate new storage if allocators do not compare equal - for (size_type i = 0; i < rhs.m_vec.size(); ++i) { + for (size_type i = 0; i < rhs.m_vec.size(); ++i) + { m_vec.emplace_back(move_destroy_value(std::move(rhs), rhs.m_vec[i])); } rhs.m_vec.clear(); @@ -389,7 +393,7 @@ class WorkStorage } // allocate and construct value in storage - template < typename holder, typename ... holder_ctor_args > + template pointer_and_size create_value(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) { @@ -414,8 +418,10 @@ class WorkStorage value_type::move_destroy(value_ptr, other_value_and_size.ptr); - allocator_traits_type::deallocate(rhs.m_aloc, - reinterpret_cast(other_value_and_size.ptr), other_value_and_size.size); + allocator_traits_type::deallocate( + rhs.m_aloc, + reinterpret_cast(other_value_and_size.ptr), + other_value_and_size.size); return pointer_and_size{value_ptr, other_value_and_size.size}; } @@ -424,12 +430,14 @@ class WorkStorage void destroy_value(pointer_and_size value_and_size_ptr) { value_type::destroy(value_and_size_ptr.ptr); - allocator_traits_type::deallocate(m_aloc, - reinterpret_cast(value_and_size_ptr.ptr), value_and_size_ptr.size); + allocator_traits_type::deallocate( + m_aloc, + reinterpret_cast(value_and_size_ptr.ptr), + value_and_size_ptr.size); } }; -template < typename ALLOCATOR_T, typename Dispatcher_T > +template class WorkStorage { using allocator_traits_type = std::allocator_traits; @@ -437,15 +445,17 @@ class WorkStorage typename allocator_traits_type::propagate_on_container_copy_assignment; using propagate_on_container_move_assignment = typename allocator_traits_type::propagate_on_container_move_assignment; - using propagate_on_container_swap = + using propagate_on_container_swap = typename allocator_traits_type::propagate_on_container_swap; - static_assert(std::is_same::value, + static_assert( + std::is_same::value, "WorkStorage expects an allocator for 'char's."); + public: using storage_policy = RAJA::ragged_array_of_objects; using dispatcher_type = Dispatcher_T; - template < typename holder > + template using true_value_type = WorkStruct; using value_type = GenericWorkStruct; @@ -457,7 +467,8 @@ class WorkStorage using pointer = value_type*; using const_pointer = const value_type*; - // iterator base class for accessing stored WorkStructs outside of the container + // iterator base class for accessing stored WorkStructs outside of the + // container struct const_iterator_base { using value_type = const typename WorkStorage::value_type; @@ -467,14 +478,12 @@ class WorkStorage using iterator_category = std::random_access_iterator_tag; const_iterator_base(const char* array_begin, 
const size_type* offset_iter) - : m_array_begin(array_begin) - , m_offset_iter(offset_iter) - { } + : m_array_begin(array_begin), m_offset_iter(offset_iter) + {} RAJA_HOST_DEVICE reference operator*() const { - return *reinterpret_cast( - m_array_begin + *m_offset_iter); + return *reinterpret_cast(m_array_begin + *m_offset_iter); } RAJA_HOST_DEVICE const_iterator_base& operator+=(difference_type n) @@ -483,20 +492,23 @@ class WorkStorage return *this; } - RAJA_HOST_DEVICE friend inline difference_type operator-( - const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter) + RAJA_HOST_DEVICE friend inline difference_type + operator-(const_iterator_base const& lhs_iter, + const_iterator_base const& rhs_iter) { return lhs_iter.m_offset_iter - rhs_iter.m_offset_iter; } - RAJA_HOST_DEVICE friend inline bool operator==( - const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter) + RAJA_HOST_DEVICE friend inline bool + operator==(const_iterator_base const& lhs_iter, + const_iterator_base const& rhs_iter) { return lhs_iter.m_offset_iter == rhs_iter.m_offset_iter; } - RAJA_HOST_DEVICE friend inline bool operator<( - const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter) + RAJA_HOST_DEVICE friend inline bool + operator<(const_iterator_base const& lhs_iter, + const_iterator_base const& rhs_iter) { return lhs_iter.m_offset_iter < rhs_iter.m_offset_iter; } @@ -510,19 +522,18 @@ class WorkStorage explicit WorkStorage(allocator_type const& aloc) - : m_offsets(0, aloc) - , m_aloc(aloc) - { } + : m_offsets(0, aloc), m_aloc(aloc) + {} WorkStorage(WorkStorage const&) = delete; WorkStorage& operator=(WorkStorage const&) = delete; WorkStorage(WorkStorage&& rhs) - : m_offsets(std::move(rhs.m_offsets)) - , m_array_begin(rhs.m_array_begin) - , m_array_end(rhs.m_array_end) - , m_array_cap(rhs.m_array_cap) - , m_aloc(std::move(rhs.m_aloc)) + : m_offsets(std::move(rhs.m_offsets)), + m_array_begin(rhs.m_array_begin), + m_array_end(rhs.m_array_end), + m_array_cap(rhs.m_array_cap), + m_aloc(std::move(rhs.m_aloc)) { rhs.m_array_begin = nullptr; rhs.m_array_end = nullptr; @@ -531,8 +542,10 @@ class WorkStorage WorkStorage& operator=(WorkStorage&& rhs) { - if (this != &rhs) { - move_assign_private(std::move(rhs), propagate_on_container_move_assignment{}); + if (this != &rhs) + { + move_assign_private(std::move(rhs), + propagate_on_container_move_assignment{}); } return *this; } @@ -546,10 +559,7 @@ class WorkStorage } // number of loops stored - size_type size() const - { - return m_offsets.size(); - } + size_type size() const { return m_offsets.size(); } const_iterator begin() const { @@ -562,17 +572,15 @@ class WorkStorage } // number of bytes used for storage of loops - size_type storage_size() const - { - return m_array_end - m_array_begin; - } + size_type storage_size() const { return m_array_end - m_array_begin; } - template < typename holder, typename ... holder_ctor_args > - void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) + template + void emplace(const dispatcher_type* dispatcher, + holder_ctor_args&&... 
ctor_args) { size_type value_offset = storage_size(); - size_type value_size = create_value(value_offset, - dispatcher, std::forward(ctor_args)...); + size_type value_size = create_value( + value_offset, dispatcher, std::forward(ctor_args)...); m_offsets.emplace_back(value_offset); m_array_end += value_size; } @@ -581,24 +589,25 @@ class WorkStorage void clear() { array_clear(); - if (m_array_begin != nullptr) { - allocator_traits_type::deallocate(m_aloc, m_array_begin, storage_capacity()); + if (m_array_begin != nullptr) + { + allocator_traits_type::deallocate( + m_aloc, m_array_begin, storage_capacity()); m_array_begin = nullptr; - m_array_end = nullptr; - m_array_cap = nullptr; + m_array_end = nullptr; + m_array_cap = nullptr; } } - ~WorkStorage() - { - clear(); - } + ~WorkStorage() { clear(); } private: - RAJAVec> m_offsets; + RAJAVec> + m_offsets; char* m_array_begin = nullptr; - char* m_array_end = nullptr; - char* m_array_cap = nullptr; + char* m_array_end = nullptr; + char* m_array_cap = nullptr; allocator_type m_aloc; // move assignment if allocator propagates on move assignment @@ -606,35 +615,39 @@ class WorkStorage { clear(); - m_offsets = std::move(rhs.m_offsets); + m_offsets = std::move(rhs.m_offsets); m_array_begin = rhs.m_array_begin; - m_array_end = rhs.m_array_end ; - m_array_cap = rhs.m_array_cap ; - m_aloc = std::move(rhs.m_aloc); + m_array_end = rhs.m_array_end; + m_array_cap = rhs.m_array_cap; + m_aloc = std::move(rhs.m_aloc); rhs.m_array_begin = nullptr; - rhs.m_array_end = nullptr; - rhs.m_array_cap = nullptr; + rhs.m_array_end = nullptr; + rhs.m_array_cap = nullptr; } // move assignment if allocator does not propagate on move assignment void move_assign_private(WorkStorage&& rhs, std::false_type) { clear(); - if (m_aloc == rhs.m_aloc) { + if (m_aloc == rhs.m_aloc) + { - m_offsets = std::move(rhs.m_offsets); + m_offsets = std::move(rhs.m_offsets); m_array_begin = rhs.m_array_begin; - m_array_end = rhs.m_array_end ; - m_array_cap = rhs.m_array_cap ; + m_array_end = rhs.m_array_end; + m_array_cap = rhs.m_array_cap; rhs.m_array_begin = nullptr; - rhs.m_array_end = nullptr; - rhs.m_array_cap = nullptr; - } else { + rhs.m_array_end = nullptr; + rhs.m_array_cap = nullptr; + } + else + { array_reserve(rhs.storage_size()); - for (size_type i = 0; i < rhs.size(); ++i) { + for (size_type i = 0; i < rhs.size(); ++i) + { m_array_end = m_array_begin + rhs.m_offsets[i]; move_destroy_value(m_array_end, rhs.m_array_begin + rhs.m_offsets[i]); m_offsets.emplace_back(rhs.m_offsets[i]); @@ -647,46 +660,45 @@ class WorkStorage } // get loop storage capacity, used and unused in bytes - size_type storage_capacity() const - { - return m_array_cap - m_array_begin; - } + size_type storage_capacity() const { return m_array_cap - m_array_begin; } // get unused loop storage capacity in bytes - size_type storage_unused() const - { - return m_array_cap - m_array_end; - } + size_type storage_unused() const { return m_array_cap - m_array_end; } // reserve space for loop_storage_size bytes of loop storage void array_reserve(size_type loop_storage_size) { - if (loop_storage_size > storage_capacity()) { + if (loop_storage_size > storage_capacity()) + { char* new_array_begin = allocator_traits_type::allocate(m_aloc, loop_storage_size); - char* new_array_end = new_array_begin + storage_size(); - char* new_array_cap = new_array_begin + loop_storage_size; + char* new_array_end = new_array_begin + storage_size(); + char* new_array_cap = new_array_begin + loop_storage_size; - for (size_type i = 0; i < 
size(); ++i) { + for (size_type i = 0; i < size(); ++i) + { move_destroy_value(new_array_begin + m_offsets[i], - m_array_begin + m_offsets[i]); + m_array_begin + m_offsets[i]); } - if (m_array_begin != nullptr) { - allocator_traits_type::deallocate(m_aloc, m_array_begin, storage_capacity()); + if (m_array_begin != nullptr) + { + allocator_traits_type::deallocate( + m_aloc, m_array_begin, storage_capacity()); } m_array_begin = new_array_begin; - m_array_end = new_array_end ; - m_array_cap = new_array_cap ; + m_array_end = new_array_end; + m_array_cap = new_array_cap; } } // destroy loop objects (does not deallocate array storage) void array_clear() { - while (!m_offsets.empty()) { + while (!m_offsets.empty()) + { destroy_value(m_offsets.back()); m_array_end = m_array_begin + m_offsets.back(); m_offsets.pop_back(); @@ -696,15 +708,17 @@ class WorkStorage // ensure there is enough storage to hold the next loop body at value offset // and store the loop body - template < typename holder, typename ... holder_ctor_args > + template size_type create_value(size_type value_offset, const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) { const size_type value_size = sizeof(true_value_type); - if (value_size > storage_unused()) { - array_reserve(std::max(storage_size() + value_size, 2*storage_capacity())); + if (value_size > storage_unused()) + { + array_reserve( + std::max(storage_size() + value_size, 2 * storage_capacity())); } pointer value_ptr = reinterpret_cast(m_array_begin + value_offset); @@ -726,13 +740,12 @@ class WorkStorage // destroy the loop body at value offset void destroy_value(size_type value_offset) { - pointer value_ptr = - reinterpret_cast(m_array_begin + value_offset); + pointer value_ptr = reinterpret_cast(m_array_begin + value_offset); value_type::destroy(value_ptr); } }; -template < typename ALLOCATOR_T, typename Dispatcher_T > +template class WorkStorage @@ -742,15 +755,17 @@ class WorkStorage::value, + static_assert( + std::is_same::value, "WorkStorage expects an allocator for 'char's."); + public: using storage_policy = RAJA::constant_stride_array_of_objects; using dispatcher_type = Dispatcher_T; - template < typename holder > + template using true_value_type = WorkStruct; using value_type = GenericWorkStruct; @@ -762,7 +777,8 @@ class WorkStorage; - explicit WorkStorage(allocator_type const& aloc) - : m_aloc(aloc) - { } + explicit WorkStorage(allocator_type const& aloc) : m_aloc(aloc) {} WorkStorage(WorkStorage const&) = delete; WorkStorage& operator=(WorkStorage const&) = delete; WorkStorage(WorkStorage&& rhs) - : m_aloc(std::move(rhs.m_aloc)) - , m_stride(rhs.m_stride) - , m_array_begin(rhs.m_array_begin) - , m_array_end(rhs.m_array_end) - , m_array_cap(rhs.m_array_cap) + : m_aloc(std::move(rhs.m_aloc)), + m_stride(rhs.m_stride), + m_array_begin(rhs.m_array_begin), + m_array_end(rhs.m_array_end), + m_array_cap(rhs.m_array_cap) { // do not reset stride, leave it for reuse rhs.m_array_begin = nullptr; - rhs.m_array_end = nullptr; - rhs.m_array_cap = nullptr; + rhs.m_array_end = nullptr; + rhs.m_array_cap = nullptr; } WorkStorage& operator=(WorkStorage&& rhs) { - if (this != &rhs) { - move_assign_private(std::move(rhs), propagate_on_container_move_assignment{}); + if (this != &rhs) + { + move_assign_private(std::move(rhs), + propagate_on_container_move_assignment{}); } return *this; } @@ -847,35 +865,28 @@ class WorkStorage - void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... 
ctor_args) + template + void emplace(const dispatcher_type* dispatcher, + holder_ctor_args&&... ctor_args) { - create_value(dispatcher, std::forward(ctor_args)...); + create_value(dispatcher, + std::forward(ctor_args)...); m_array_end += m_stride; } @@ -883,64 +894,67 @@ class WorkStorage storage_capacity() || new_stride > m_stride) { + if (loop_storage_size > storage_capacity() || new_stride > m_stride) + { char* new_array_begin = allocator_traits_type::allocate(m_aloc, loop_storage_size); - char* new_array_end = new_array_begin + size() * new_stride; - char* new_array_cap = new_array_begin + loop_storage_size; + char* new_array_end = new_array_begin + size() * new_stride; + char* new_array_cap = new_array_begin + loop_storage_size; - for (size_type i = 0; i < size(); ++i) { + for (size_type i = 0; i < size(); ++i) + { move_destroy_value(new_array_begin + i * new_stride, - m_array_begin + i * m_stride); + m_array_begin + i * m_stride); } - if (m_array_begin != nullptr) { - allocator_traits_type::deallocate(m_aloc, m_array_begin, storage_capacity()); + if (m_array_begin != nullptr) + { + allocator_traits_type::deallocate( + m_aloc, m_array_begin, storage_capacity()); } - m_stride = new_stride ; + m_stride = new_stride; m_array_begin = new_array_begin; - m_array_end = new_array_end ; - m_array_cap = new_array_cap ; + m_array_end = new_array_end; + m_array_cap = new_array_cap; } } // destroy the loops in storage (does not deallocate loop storage) void array_clear() { - for (size_type value_offset = storage_size(); value_offset > 0; value_offset -= m_stride) { + for (size_type value_offset = storage_size(); value_offset > 0; + value_offset -= m_stride) + { destroy_value(value_offset - m_stride); m_array_end -= m_stride; } @@ -1002,18 +1016,20 @@ class WorkStorage + template void create_value(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) { const size_type value_size = sizeof(true_value_type); - if (value_size > storage_unused() && value_size <= m_stride) { - array_reserve(std::max(storage_size() + m_stride, 2*storage_capacity()), + if (value_size > storage_unused() && value_size <= m_stride) + { + array_reserve(std::max(storage_size() + m_stride, 2 * storage_capacity()), m_stride); - } else if (value_size > m_stride) { - array_reserve((size()+1)*value_size, - value_size); + } + else if (value_size > m_stride) + { + array_reserve((size() + 1) * value_size, value_size); } size_type value_offset = storage_size(); @@ -1025,8 +1041,7 @@ class WorkStorage(value_ptr), reinterpret_cast(other_value_ptr)); @@ -1035,14 +1050,13 @@ class WorkStorage(m_array_begin + value_offset); + pointer value_ptr = reinterpret_cast(m_array_begin + value_offset); value_type::destroy(value_ptr); } }; -} // namespace detail +} // namespace detail -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/WorkGroup/WorkStruct.hpp b/include/RAJA/pattern/WorkGroup/WorkStruct.hpp index 72e1540c54..4ccfc5d4f5 100644 --- a/include/RAJA/pattern/WorkGroup/WorkStruct.hpp +++ b/include/RAJA/pattern/WorkGroup/WorkStruct.hpp @@ -35,7 +35,7 @@ namespace detail /*! * A struct that gives a generic way to layout memory for different loops */ -template < size_t size, typename Dispatcher_T > +template struct WorkStruct; /*! 
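The three WorkStorage specializations reformatted above differ only in layout: array_of_pointers allocates each WorkStruct separately and keeps a vector of pointer/size pairs, ragged_array_of_objects packs the structs contiguously and records a per-item byte offset, and constant_stride_array_of_objects packs them at a single fixed stride large enough for the biggest struct. The storage policy is selected as the third WorkGroupPolicy parameter; the aliases below are hypothetical examples of that choice (assuming a sequential work policy and the default dispatch policy):

    #include "RAJA/RAJA.hpp"

    using ptrs_pol    = RAJA::WorkGroupPolicy<RAJA::seq_work, RAJA::ordered,
                                              RAJA::array_of_pointers>;
    using ragged_pol  = RAJA::WorkGroupPolicy<RAJA::seq_work, RAJA::ordered,
                                              RAJA::ragged_array_of_objects>;
    using strided_pol = RAJA::WorkGroupPolicy<RAJA::seq_work, RAJA::ordered,
                                              RAJA::constant_stride_array_of_objects>;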
@@ -44,67 +44,75 @@ struct WorkStruct; * offsetof(GenericWorkStruct<>, obj) == offsetof(WorkStruct, obj) * sizeof(GenericWorkStruct) <= sizeof(WorkStruct) */ -template < typename Dispatcher_T > +template using GenericWorkStruct = WorkStruct; -template < size_t size, Platform platform, typename dispatch_policy, typename DispatcherID, typename ... CallArgs > -struct WorkStruct> +template +struct WorkStruct< + size, + Dispatcher> { - using dispatcher_type = Dispatcher; + using dispatcher_type = + Dispatcher; // construct a WorkStruct with a value of type holder from the args and // check a variety of constraints at compile time - template < typename holder, typename ... holder_ctor_args > - static RAJA_INLINE - void construct(void* ptr, const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) + template + static RAJA_INLINE void construct(void* ptr, + const dispatcher_type* dispatcher, + holder_ctor_args&&... ctor_args) { using true_value_type = WorkStruct; using value_type = GenericWorkStruct; static_assert(sizeof(holder) <= sizeof(true_value_type::obj), - "holder must fit in WorkStruct::obj"); + "holder must fit in WorkStruct::obj"); static_assert(std::is_standard_layout::value, - "WorkStruct must be a standard layout type"); + "WorkStruct must be a standard layout type"); static_assert(std::is_standard_layout::value, - "GenericWorkStruct must be a standard layout type"); + "GenericWorkStruct must be a standard layout type"); static_assert(offsetof(value_type, obj) == offsetof(true_value_type, obj), - "WorkStruct and GenericWorkStruct must have obj at the same offset"); + "WorkStruct and GenericWorkStruct must have obj at the same " + "offset"); static_assert(sizeof(value_type) <= sizeof(true_value_type), - "WorkStruct must not be smaller than GenericWorkStruct"); + "WorkStruct must not be smaller than GenericWorkStruct"); true_value_type* value_ptr = static_cast(ptr); value_ptr->dispatcher = dispatcher; value_ptr->invoke = dispatcher->invoke; - new(&value_ptr->obj) holder(std::forward(ctor_args)...); + new (&value_ptr->obj) holder(std::forward(ctor_args)...); } // move construct in dst from the value in src and destroy the value in src - static RAJA_INLINE - void move_destroy(WorkStruct* value_dst, - WorkStruct* value_src) + static RAJA_INLINE void move_destroy(WorkStruct* value_dst, + WorkStruct* value_src) { value_dst->dispatcher = value_src->dispatcher; value_dst->invoke = value_src->invoke; - value_dst->dispatcher->move_construct_destroy(&value_dst->obj, &value_src->obj); + value_dst->dispatcher->move_construct_destroy(&value_dst->obj, + &value_src->obj); } // destroy the value ptr - static RAJA_INLINE - void destroy(WorkStruct* value_ptr) + static RAJA_INLINE void destroy(WorkStruct* value_ptr) { value_ptr->dispatcher->destroy(&value_ptr->obj); } // invoke the call operator of the value ptr with args - static RAJA_INLINE - void host_call(const WorkStruct* value_ptr, CallArgs... args) + static RAJA_INLINE void host_call(const WorkStruct* value_ptr, + CallArgs... args) { value_ptr->invoke(&value_ptr->obj, std::forward(args)...); } /// // invoke the call operator of the value ptr with args - static RAJA_DEVICE RAJA_INLINE - void device_call(const WorkStruct* value_ptr, CallArgs... args) + static RAJA_DEVICE RAJA_INLINE void device_call(const WorkStruct* value_ptr, + CallArgs... 
args) { value_ptr->invoke(&value_ptr->obj, std::forward(args)...); } @@ -114,8 +122,8 @@ struct WorkStruct::type obj; }; -} // namespace detail +} // namespace detail -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/atomic.hpp b/include/RAJA/pattern/atomic.hpp index d5905f7928..2a024c6db3 100644 --- a/include/RAJA/pattern/atomic.hpp +++ b/include/RAJA/pattern/atomic.hpp @@ -87,7 +87,7 @@ namespace RAJA */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(T *acc) +RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(T* acc) { return RAJA::atomicLoad(Policy{}, acc); } @@ -100,7 +100,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(T *acc) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(T *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(T* acc, T value) { RAJA::atomicStore(Policy{}, acc, value); } @@ -114,7 +114,7 @@ RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(T *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T* acc, T value) { return RAJA::atomicAdd(Policy{}, acc, value); } @@ -128,7 +128,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T* acc, T value) { return RAJA::atomicSub(Policy{}, acc, value); } @@ -142,7 +142,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T* acc, T value) { return RAJA::atomicMin(Policy{}, acc, value); } @@ -156,7 +156,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T* acc, T value) { return RAJA::atomicMax(Policy{}, acc, value); } @@ -169,7 +169,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc) +RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T* acc) { return RAJA::atomicInc(Policy{}, acc); } @@ -185,7 +185,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc, T compare) +RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T* acc, T compare) { return RAJA::atomicInc(Policy{}, acc, compare); } @@ -198,7 +198,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc, T compare) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc) +RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T* acc) { return RAJA::atomicDec(Policy{}, acc); } @@ -214,7 +214,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc, T compare) +RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T* acc, T compare) { return RAJA::atomicDec(Policy{}, acc, compare); } @@ -229,7 +229,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc, T compare) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T* acc, T value) { static_assert(std::is_integral::value, "atomicAnd can only be used on integral types"); @@ -246,7 +246,7 @@ RAJA_INLINE 
RAJA_HOST_DEVICE T atomicAnd(T *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T* acc, T value) { static_assert(std::is_integral::value, "atomicOr can only be used on integral types"); @@ -263,7 +263,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T* acc, T value) { static_assert(std::is_integral::value, "atomicXor can only be used on integral types"); @@ -279,7 +279,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T* acc, T value) { return RAJA::atomicExchange(Policy{}, acc, value); } @@ -295,7 +295,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T *acc, T value) RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T *acc, T compare, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T* acc, T compare, T value) { return RAJA::atomicCAS(Policy{}, acc, compare, value); } @@ -317,22 +317,18 @@ class AtomicRef RAJA_INLINE RAJA_HOST_DEVICE - constexpr explicit AtomicRef(value_type *value_ptr) - : m_value_ptr(value_ptr) {} + constexpr explicit AtomicRef(value_type* value_ptr) : m_value_ptr(value_ptr) + {} RAJA_INLINE RAJA_HOST_DEVICE - constexpr AtomicRef(AtomicRef const &c) - : m_value_ptr(c.m_value_ptr) {} + constexpr AtomicRef(AtomicRef const& c) : m_value_ptr(c.m_value_ptr) {} AtomicRef& operator=(AtomicRef const&) = delete; RAJA_INLINE RAJA_HOST_DEVICE - value_type * getPointer() const - { - return m_value_ptr; - } + value_type* getPointer() const { return m_value_ptr; } RAJA_INLINE RAJA_HOST_DEVICE @@ -351,17 +347,11 @@ class AtomicRef RAJA_INLINE RAJA_HOST_DEVICE - value_type load() const - { - return RAJA::atomicLoad(m_value_ptr); - } + value_type load() const { return RAJA::atomicLoad(m_value_ptr); } RAJA_INLINE RAJA_HOST_DEVICE - operator value_type() const - { - return RAJA::atomicLoad(m_value_ptr); - } + operator value_type() const { return RAJA::atomicLoad(m_value_ptr); } RAJA_INLINE RAJA_HOST_DEVICE @@ -383,9 +373,12 @@ class AtomicRef { value_type compare = expect; value_type old = RAJA::atomicCAS(m_value_ptr, compare, rhs); - if (compare == old) { + if (compare == old) + { return true; - } else { + } + else + { expect = old; return false; } @@ -527,10 +520,10 @@ class AtomicRef } private: - value_type *m_value_ptr; + value_type* m_value_ptr; }; -} // namespace RAJA +} // namespace RAJA #endif diff --git a/include/RAJA/pattern/detail/algorithm.hpp b/include/RAJA/pattern/detail/algorithm.hpp index 21d266bd21..23ed6c462e 100644 --- a/include/RAJA/pattern/detail/algorithm.hpp +++ b/include/RAJA/pattern/detail/algorithm.hpp @@ -49,30 +49,29 @@ using ContainerVal = camp::decay>())>; template -using ContainerRef = - decltype(*camp::val>()); +using ContainerRef = decltype(*camp::val>()); template using ContainerDiff = - camp::decay>()-camp::val>())>; + camp::decay>() - + camp::val>())>; template -RAJA_INLINE -DiffType firstIndex(DiffType n, CountType num_threads, CountType thread_id) +RAJA_INLINE DiffType firstIndex(DiffType n, + CountType num_threads, + CountType thread_id) { return (static_cast(n) * thread_id) / num_threads; } -} // end namespace detail +} // end namespace detail /*! 
\brief swap values at iterators lhs and rhs */ template -RAJA_HOST_DEVICE RAJA_INLINE -void -safe_iter_swap(Iter lhs, Iter rhs) +RAJA_HOST_DEVICE RAJA_INLINE void safe_iter_swap(Iter lhs, Iter rhs) { #ifdef RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE using camp::safe_swap; @@ -87,9 +86,7 @@ safe_iter_swap(Iter lhs, Iter rhs) \brief returns iterator to next item */ template -RAJA_HOST_DEVICE RAJA_INLINE -Iter -next(Iter it) +RAJA_HOST_DEVICE RAJA_INLINE Iter next(Iter it) { ++it; return it; @@ -99,14 +96,12 @@ next(Iter it) \brief returns iterator to next item */ template -RAJA_HOST_DEVICE RAJA_INLINE -Iter -prev(Iter it) +RAJA_HOST_DEVICE RAJA_INLINE Iter prev(Iter it) { --it; return it; } -} // end namespace RAJA +} // end namespace RAJA #endif diff --git a/include/RAJA/pattern/detail/forall.hpp b/include/RAJA/pattern/detail/forall.hpp index 3bd5d7ecaf..217ef0b882 100644 --- a/include/RAJA/pattern/detail/forall.hpp +++ b/include/RAJA/pattern/detail/forall.hpp @@ -19,12 +19,12 @@ #ifndef RAJA_PATTERN_DETAIL_FORALL_HPP #define RAJA_PATTERN_DETAIL_FORALL_HPP -#define RAJA_EXTRACT_BED_SUFFIXED(CONTAINER, SUFFIX) \ - using std::begin; \ - using std::end; \ - using std::distance; \ - auto begin##SUFFIX = begin(CONTAINER); \ - auto end##SUFFIX = end(CONTAINER); \ +#define RAJA_EXTRACT_BED_SUFFIXED(CONTAINER, SUFFIX) \ + using std::begin; \ + using std::end; \ + using std::distance; \ + auto begin##SUFFIX = begin(CONTAINER); \ + auto end##SUFFIX = end(CONTAINER); \ auto distance##SUFFIX = distance(begin##SUFFIX, end##SUFFIX) #define RAJA_EXTRACT_BED_IT(CONTAINER) RAJA_EXTRACT_BED_SUFFIXED(CONTAINER, _it) diff --git a/include/RAJA/pattern/detail/multi_reduce.hpp b/include/RAJA/pattern/detail/multi_reduce.hpp index 884b9aa989..0f6b7069b2 100644 --- a/include/RAJA/pattern/detail/multi_reduce.hpp +++ b/include/RAJA/pattern/detail/multi_reduce.hpp @@ -26,32 +26,29 @@ #include "RAJA/util/RepeatView.hpp" -#define RAJA_DECLARE_MULTI_REDUCER(OP_NAME, OP, POL, DATA) \ - template \ - struct MultiReduce##OP_NAME, T> \ - : reduce::detail::BaseMultiReduce##OP_NAME< \ - DATA, tuning>> \ - { \ - using policy = POL; \ - using Base = reduce::detail::BaseMultiReduce##OP_NAME< \ - DATA, tuning>>; \ - using Base::Base; \ - using typename Base::value_type; \ - using typename Base::reference; \ - \ - RAJA_SUPPRESS_HD_WARN \ - RAJA_HOST_DEVICE \ - reference operator[](size_t bin) const \ - { \ - return reference(*this, bin); \ - } \ +#define RAJA_DECLARE_MULTI_REDUCER(OP_NAME, OP, POL, DATA) \ + template \ + struct MultiReduce##OP_NAME, T> \ + : reduce::detail::BaseMultiReduce##OP_NAME< \ + DATA, tuning>> \ + { \ + using policy = POL; \ + using Base = reduce::detail::BaseMultiReduce##OP_NAME< \ + DATA, tuning>>; \ + using Base::Base; \ + using typename Base::value_type; \ + using typename Base::reference; \ + \ + RAJA_SUPPRESS_HD_WARN \ + RAJA_HOST_DEVICE \ + reference operator[](size_t bin) const { return reference(*this, bin); } \ }; -#define RAJA_DECLARE_ALL_MULTI_REDUCERS(POL, DATA) \ - RAJA_DECLARE_MULTI_REDUCER(Sum, sum, POL, DATA) \ - RAJA_DECLARE_MULTI_REDUCER(Min, min, POL, DATA) \ - RAJA_DECLARE_MULTI_REDUCER(Max, max, POL, DATA) \ - RAJA_DECLARE_MULTI_REDUCER(BitOr, or_bit, POL, DATA) \ +#define RAJA_DECLARE_ALL_MULTI_REDUCERS(POL, DATA) \ + RAJA_DECLARE_MULTI_REDUCER(Sum, sum, POL, DATA) \ + RAJA_DECLARE_MULTI_REDUCER(Min, min, POL, DATA) \ + RAJA_DECLARE_MULTI_REDUCER(Max, max, POL, DATA) \ + RAJA_DECLARE_MULTI_REDUCER(BitOr, or_bit, POL, DATA) \ RAJA_DECLARE_MULTI_REDUCER(BitAnd, and_bit, POL, DATA) namespace RAJA 
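The RAJA_DECLARE_MULTI_REDUCER macros above stamp out the MultiReduceSum, MultiReduceMin, MultiReduceMax, MultiReduceBitOr, and MultiReduceBitAnd types whose operator[] selects a bin to combine into. A short usage sketch, assuming the sequential multi-reduce policy is named RAJA::seq_multi_reduce (the histogram data here is illustrative):

#include "RAJA/RAJA.hpp"

void binned_sums(const double* values, const int* bin_of, int n, int num_bins)
{
  // One running sum per bin; operator[](bin) returns a reference proxy
  // whose += combines a value into that bin.
  RAJA::MultiReduceSum<RAJA::seq_multi_reduce, double> bin_sum(num_bins);

  RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, n), [=](int i) {
    bin_sum[bin_of[i]] += values[i];
  });

  for (int b = 0; b < num_bins; ++b)
  {
    double s = bin_sum.get(b);  // per-bin result, available after the loop
    (void)s;
  }
}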
@@ -70,29 +67,34 @@ struct BaseMultiReduce using MultiReduceOp = typename t_MultiReduceData::MultiReduceOp; using value_type = typename t_MultiReduceData::value_type; - BaseMultiReduce() : BaseMultiReduce{RepeatView(MultiReduceOp::identity(), 0)} {} + BaseMultiReduce() + : BaseMultiReduce{RepeatView(MultiReduceOp::identity(), 0)} + {} explicit BaseMultiReduce(size_t num_bins, value_type init_val = MultiReduceOp::identity(), value_type identity = MultiReduceOp::identity()) : BaseMultiReduce{RepeatView(init_val, num_bins), identity} - { } - - template < typename Container, - concepts::enable_if_t, - concepts::negate>, - concepts::negate>>* = nullptr > + {} + + template < + typename Container, + concepts::enable_if_t< + type_traits::is_range, + concepts::negate>, + concepts::negate>>* = + nullptr> explicit BaseMultiReduce(Container const& container, value_type identity = MultiReduceOp::identity()) : data{container, identity} - { } + {} RAJA_SUPPRESS_HD_WARN BaseMultiReduce(BaseMultiReduce const&) = default; RAJA_SUPPRESS_HD_WARN - BaseMultiReduce(BaseMultiReduce &&) = default; - BaseMultiReduce &operator=(BaseMultiReduce const&) = delete; - BaseMultiReduce &operator=(BaseMultiReduce &&) = delete; + BaseMultiReduce(BaseMultiReduce&&) = default; + BaseMultiReduce& operator=(BaseMultiReduce const&) = delete; + BaseMultiReduce& operator=(BaseMultiReduce&&) = delete; RAJA_SUPPRESS_HD_WARN ~BaseMultiReduce() = default; @@ -108,12 +110,13 @@ struct BaseMultiReduce reset(RepeatView(init_val, num_bins), identity); } - template < typename Container, - concepts::enable_if_t>* = nullptr > + template >* = nullptr> void reset(Container const& container, value_type identity = MultiReduceOp::identity()) { - for (size_t bin = 0; bin < data.num_bins(); ++bin) { + for (size_t bin = 0; bin < data.num_bins(); ++bin) + { RAJA_UNUSED_VAR(get(bin)); // automatic get() before reset } data.reset(container, identity); @@ -125,7 +128,7 @@ struct BaseMultiReduce RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE - BaseMultiReduce const& combine(size_t bin, value_type const &other) const + BaseMultiReduce const& combine(size_t bin, value_type const& other) const { data.combine(bin, other); return *this; @@ -135,16 +138,19 @@ struct BaseMultiReduce value_type get(size_t bin) const { return data.get(bin); } //! 
Get the calculated reduced value for each bin and store it in container - template < typename Container, - concepts::enable_if_t>* = nullptr > + template >* = nullptr> void get_all(Container& container) const { RAJA_EXTRACT_BED_IT(container); - if (size_t(distance_it) != data.num_bins()) { - RAJA_ABORT_OR_THROW("MultiReduce::get_all container has different size than multi reducer"); + if (size_t(distance_it) != data.num_bins()) + { + RAJA_ABORT_OR_THROW("MultiReduce::get_all container has different size " + "than multi reducer"); } size_t bin = 0; - for (auto& val : container) { + for (auto& val : container) + { val = data.get(bin); ++bin; } @@ -167,17 +173,17 @@ class BaseMultiReduceMin : public BaseMultiReduce { public: using Base = BaseMultiReduce; - using typename Base::value_type; using Base::Base; + using typename Base::value_type; RAJA_SUPPRESS_HD_WARN BaseMultiReduceMin(BaseMultiReduceMin const&) = default; RAJA_SUPPRESS_HD_WARN - BaseMultiReduceMin(BaseMultiReduceMin &&) = default; + BaseMultiReduceMin(BaseMultiReduceMin&&) = default; RAJA_SUPPRESS_HD_WARN - BaseMultiReduceMin &operator=(BaseMultiReduceMin const&) = delete; + BaseMultiReduceMin& operator=(BaseMultiReduceMin const&) = delete; RAJA_SUPPRESS_HD_WARN - BaseMultiReduceMin &operator=(BaseMultiReduceMin &&) = delete; + BaseMultiReduceMin& operator=(BaseMultiReduceMin&&) = delete; RAJA_SUPPRESS_HD_WARN ~BaseMultiReduceMin() = default; @@ -185,8 +191,8 @@ class BaseMultiReduceMin : public BaseMultiReduce { RAJA_HOST_DEVICE reference(BaseMultiReduceMin const& base, size_t bin) - : m_base(base), m_bin(bin) - { } + : m_base(base), m_bin(bin) + {} //! reducer function; updates the current instance's state RAJA_HOST_DEVICE @@ -196,10 +202,7 @@ class BaseMultiReduceMin : public BaseMultiReduce return *this; } - value_type get() const - { - return m_base.get(m_bin); - } + value_type get() const { return m_base.get(m_bin); } private: BaseMultiReduceMin const& m_base; @@ -226,9 +229,9 @@ class BaseMultiReduceMax : public BaseMultiReduce RAJA_SUPPRESS_HD_WARN BaseMultiReduceMax(BaseMultiReduceMax const&) = default; RAJA_SUPPRESS_HD_WARN - BaseMultiReduceMax(BaseMultiReduceMax &&) = default; - BaseMultiReduceMax &operator=(BaseMultiReduceMax const&) = delete; - BaseMultiReduceMax &operator=(BaseMultiReduceMax &&) = delete; + BaseMultiReduceMax(BaseMultiReduceMax&&) = default; + BaseMultiReduceMax& operator=(BaseMultiReduceMax const&) = delete; + BaseMultiReduceMax& operator=(BaseMultiReduceMax&&) = delete; RAJA_SUPPRESS_HD_WARN ~BaseMultiReduceMax() = default; @@ -236,8 +239,8 @@ class BaseMultiReduceMax : public BaseMultiReduce { RAJA_HOST_DEVICE reference(BaseMultiReduceMax const& base, size_t bin) - : m_base(base), m_bin(bin) - { } + : m_base(base), m_bin(bin) + {} //! 
reducer function; updates the current instance's state RAJA_HOST_DEVICE @@ -247,10 +250,7 @@ class BaseMultiReduceMax : public BaseMultiReduce return *this; } - value_type get() const - { - return m_base.get(m_bin); - } + value_type get() const { return m_base.get(m_bin); } private: BaseMultiReduceMax const& m_base; @@ -277,9 +277,9 @@ class BaseMultiReduceSum : public BaseMultiReduce RAJA_SUPPRESS_HD_WARN BaseMultiReduceSum(BaseMultiReduceSum const&) = default; RAJA_SUPPRESS_HD_WARN - BaseMultiReduceSum(BaseMultiReduceSum &&) = default; - BaseMultiReduceSum &operator=(BaseMultiReduceSum const&) = delete; - BaseMultiReduceSum &operator=(BaseMultiReduceSum &&) = delete; + BaseMultiReduceSum(BaseMultiReduceSum&&) = default; + BaseMultiReduceSum& operator=(BaseMultiReduceSum const&) = delete; + BaseMultiReduceSum& operator=(BaseMultiReduceSum&&) = delete; RAJA_SUPPRESS_HD_WARN ~BaseMultiReduceSum() = default; @@ -287,8 +287,8 @@ class BaseMultiReduceSum : public BaseMultiReduce { RAJA_HOST_DEVICE reference(BaseMultiReduceSum const& base, size_t bin) - : m_base(base), m_bin(bin) - { } + : m_base(base), m_bin(bin) + {} //! reducer function; updates the current instance's state RAJA_HOST_DEVICE @@ -298,10 +298,7 @@ class BaseMultiReduceSum : public BaseMultiReduce return *this; } - value_type get() const - { - return m_base.get(m_bin); - } + value_type get() const { return m_base.get(m_bin); } private: BaseMultiReduceSum const& m_base; @@ -328,9 +325,9 @@ class BaseMultiReduceBitOr : public BaseMultiReduce RAJA_SUPPRESS_HD_WARN BaseMultiReduceBitOr(BaseMultiReduceBitOr const&) = default; RAJA_SUPPRESS_HD_WARN - BaseMultiReduceBitOr(BaseMultiReduceBitOr &&) = default; - BaseMultiReduceBitOr &operator=(BaseMultiReduceBitOr const&) = delete; - BaseMultiReduceBitOr &operator=(BaseMultiReduceBitOr &&) = delete; + BaseMultiReduceBitOr(BaseMultiReduceBitOr&&) = default; + BaseMultiReduceBitOr& operator=(BaseMultiReduceBitOr const&) = delete; + BaseMultiReduceBitOr& operator=(BaseMultiReduceBitOr&&) = delete; RAJA_SUPPRESS_HD_WARN ~BaseMultiReduceBitOr() = default; @@ -338,8 +335,8 @@ class BaseMultiReduceBitOr : public BaseMultiReduce { RAJA_HOST_DEVICE reference(BaseMultiReduceBitOr const& base, size_t bin) - : m_base(base), m_bin(bin) - { } + : m_base(base), m_bin(bin) + {} //! reducer function; updates the current instance's state RAJA_HOST_DEVICE @@ -349,10 +346,7 @@ class BaseMultiReduceBitOr : public BaseMultiReduce return *this; } - value_type get() const - { - return m_base.get(m_bin); - } + value_type get() const { return m_base.get(m_bin); } private: BaseMultiReduceBitOr const& m_base; @@ -379,9 +373,9 @@ class BaseMultiReduceBitAnd : public BaseMultiReduce RAJA_SUPPRESS_HD_WARN BaseMultiReduceBitAnd(BaseMultiReduceBitAnd const&) = default; RAJA_SUPPRESS_HD_WARN - BaseMultiReduceBitAnd(BaseMultiReduceBitAnd &&) = default; - BaseMultiReduceBitAnd &operator=(BaseMultiReduceBitAnd const&) = delete; - BaseMultiReduceBitAnd &operator=(BaseMultiReduceBitAnd &&) = delete; + BaseMultiReduceBitAnd(BaseMultiReduceBitAnd&&) = default; + BaseMultiReduceBitAnd& operator=(BaseMultiReduceBitAnd const&) = delete; + BaseMultiReduceBitAnd& operator=(BaseMultiReduceBitAnd&&) = delete; RAJA_SUPPRESS_HD_WARN ~BaseMultiReduceBitAnd() = default; @@ -389,8 +383,8 @@ class BaseMultiReduceBitAnd : public BaseMultiReduce { RAJA_HOST_DEVICE reference(BaseMultiReduceBitAnd const& base, size_t bin) - : m_base(base), m_bin(bin) - { } + : m_base(base), m_bin(bin) + {} //! 
reducer function; updates the current instance's state RAJA_HOST_DEVICE @@ -400,10 +394,7 @@ class BaseMultiReduceBitAnd : public BaseMultiReduce return *this; } - value_type get() const - { - return m_base.get(m_bin); - } + value_type get() const { return m_base.get(m_bin); } private: BaseMultiReduceBitAnd const& m_base; @@ -411,10 +402,10 @@ class BaseMultiReduceBitAnd : public BaseMultiReduce }; }; -} // namespace detail +} // namespace detail -} // namespace reduce +} // namespace reduce -} // namespace RAJA +} // namespace RAJA #endif /* RAJA_PATTERN_DETAIL_MULTI_REDUCE_HPP */ diff --git a/include/RAJA/pattern/detail/privatizer.hpp b/include/RAJA/pattern/detail/privatizer.hpp index 3579027cd3..036890e067 100644 --- a/include/RAJA/pattern/detail/privatizer.hpp +++ b/include/RAJA/pattern/detail/privatizer.hpp @@ -42,16 +42,19 @@ class has_privatizer static_assert(!has_privatizer::value, "if this fires, abandon all hope"); -struct GenericWrapperBase { -}; +struct GenericWrapperBase +{}; template -struct Privatizer { +struct Privatizer +{ using value_type = camp::decay; using reference_type = value_type&; value_type priv; static_assert(!has_privatizer::value, - "Privatizer selected inappropriately, this is almost certainly " + "Privatizer selected " + "inappropriately, this is almost " + "certainly " "a bug"); static_assert(!std::is_base_of::value, "Privatizer selected inappropriately, this is almost certainly " @@ -96,8 +99,8 @@ RAJA_HOST_DEVICE auto thread_privatize(const T& item) -> typename T::privatizer return typename T::privatizer{item}; } -} // namespace internal +} // namespace internal -} // namespace RAJA +} // namespace RAJA #endif /* __RAJA_PRIVATIZER_HPP */ diff --git a/include/RAJA/pattern/detail/reduce.hpp b/include/RAJA/pattern/detail/reduce.hpp index 788f3c698d..70cfbd856c 100644 --- a/include/RAJA/pattern/detail/reduce.hpp +++ b/include/RAJA/pattern/detail/reduce.hpp @@ -21,33 +21,33 @@ #include "RAJA/util/Operators.hpp" #include "RAJA/util/types.hpp" -#define RAJA_DECLARE_REDUCER(OP, POL, COMBINER) \ - template \ - class Reduce##OP \ - : public reduce::detail::BaseReduce##OP \ - { \ - public: \ - using Base = reduce::detail::BaseReduce##OP; \ - using Base::Base; \ +#define RAJA_DECLARE_REDUCER(OP, POL, COMBINER) \ + template \ + class Reduce##OP \ + : public reduce::detail::BaseReduce##OP \ + { \ + public: \ + using Base = reduce::detail::BaseReduce##OP; \ + using Base::Base; \ }; -#define RAJA_DECLARE_INDEX_REDUCER(OP, POL, COMBINER) \ - template \ - class Reduce##OP \ - : public reduce::detail::BaseReduce##OP \ - { \ - public: \ - using Base = reduce::detail::BaseReduce##OP; \ - using Base::Base; \ +#define RAJA_DECLARE_INDEX_REDUCER(OP, POL, COMBINER) \ + template \ + class Reduce##OP \ + : public reduce::detail::BaseReduce##OP \ + { \ + public: \ + using Base = reduce::detail::BaseReduce##OP; \ + using Base::Base; \ }; -#define RAJA_DECLARE_ALL_REDUCERS(POL, COMBINER) \ - RAJA_DECLARE_REDUCER(Sum, POL, COMBINER) \ - RAJA_DECLARE_REDUCER(Min, POL, COMBINER) \ - RAJA_DECLARE_REDUCER(Max, POL, COMBINER) \ - RAJA_DECLARE_INDEX_REDUCER(MinLoc, POL, COMBINER) \ - RAJA_DECLARE_INDEX_REDUCER(MaxLoc, POL, COMBINER) \ - RAJA_DECLARE_REDUCER(BitOr, POL, COMBINER) \ +#define RAJA_DECLARE_ALL_REDUCERS(POL, COMBINER) \ + RAJA_DECLARE_REDUCER(Sum, POL, COMBINER) \ + RAJA_DECLARE_REDUCER(Min, POL, COMBINER) \ + RAJA_DECLARE_REDUCER(Max, POL, COMBINER) \ + RAJA_DECLARE_INDEX_REDUCER(MinLoc, POL, COMBINER) \ + RAJA_DECLARE_INDEX_REDUCER(MaxLoc, POL, COMBINER) \ + 
RAJA_DECLARE_REDUCER(BitOr, POL, COMBINER) \ RAJA_DECLARE_REDUCER(BitAnd, POL, COMBINER) namespace RAJA @@ -64,39 +64,40 @@ namespace detail { template class Op> -struct op_adapter : private Op { +struct op_adapter : private Op +{ using operator_type = Op; RAJA_HOST_DEVICE static constexpr T identity() { return operator_type::identity(); } - RAJA_HOST_DEVICE RAJA_INLINE void operator()(T &val, const T v) const + RAJA_HOST_DEVICE RAJA_INLINE void operator()(T& val, const T v) const { val = operator_type::operator()(val, v); } }; -} // namespace detail +} // namespace detail template -struct sum : detail::op_adapter { -}; +struct sum : detail::op_adapter +{}; template -struct min : detail::op_adapter { -}; +struct min : detail::op_adapter +{}; template -struct max : detail::op_adapter { -}; +struct max : detail::op_adapter +{}; template -struct or_bit : detail::op_adapter { -}; +struct or_bit : detail::op_adapter +{}; template -struct and_bit : detail::op_adapter { -}; +struct and_bit : detail::op_adapter +{}; #if defined(RAJA_ENABLE_TARGET_OPENMP) @@ -107,10 +108,11 @@ namespace detail { template ::value> -struct DefaultLoc {}; +struct DefaultLoc +{}; template -struct DefaultLoc // any non-integral type +struct DefaultLoc // any non-integral type { RAJA_HOST_DEVICE constexpr T value() const { return T(); } }; @@ -128,55 +130,67 @@ class ValueLoc T val = doing_min ? operators::limits::max() : operators::limits::min(); IndexType loc = DefaultLoc().value(); -#if __NVCC__ && defined(CUDART_VERSION) && CUDART_VERSION < 9020 || defined(__HIPCC__) +#if __NVCC__ && defined(CUDART_VERSION) && CUDART_VERSION < 9020 || \ + defined(__HIPCC__) RAJA_HOST_DEVICE constexpr ValueLoc() {} - RAJA_HOST_DEVICE constexpr ValueLoc(ValueLoc const &other) : val{other.val}, loc{other.loc} {} + RAJA_HOST_DEVICE constexpr ValueLoc(ValueLoc const& other) + : val{other.val}, loc{other.loc} + {} RAJA_HOST_DEVICE - ValueLoc &operator=(ValueLoc const &other) { val = other.val; loc = other.loc; return *this;} + ValueLoc& operator=(ValueLoc const& other) + { + val = other.val; + loc = other.loc; + return *this; + } #else constexpr ValueLoc() = default; - constexpr ValueLoc(ValueLoc const &) = default; - ValueLoc &operator=(ValueLoc const &) = default; + constexpr ValueLoc(ValueLoc const&) = default; + ValueLoc& operator=(ValueLoc const&) = default; #endif - RAJA_HOST_DEVICE constexpr ValueLoc(T const &val_) : val{val_}, loc{DefaultLoc().value()} {} - RAJA_HOST_DEVICE constexpr ValueLoc(T const &val_, IndexType const &loc_) + RAJA_HOST_DEVICE constexpr ValueLoc(T const& val_) + : val{val_}, loc{DefaultLoc().value()} + {} + RAJA_HOST_DEVICE constexpr ValueLoc(T const& val_, IndexType const& loc_) : val{val_}, loc{loc_} - { - } + {} RAJA_HOST_DEVICE operator T() const { return val; } RAJA_HOST_DEVICE IndexType getLoc() { return loc; } - RAJA_HOST_DEVICE bool operator<(ValueLoc const &rhs) const + RAJA_HOST_DEVICE bool operator<(ValueLoc const& rhs) const { return val < rhs.val; } - RAJA_HOST_DEVICE bool operator>(ValueLoc const &rhs) const + RAJA_HOST_DEVICE bool operator>(ValueLoc const& rhs) const { return val > rhs.val; } }; -} // namespace detail +} // namespace detail -} // namespace reduce +} // namespace reduce namespace operators { template -struct limits<::RAJA::reduce::detail::ValueLoc> { - RAJA_INLINE RAJA_HOST_DEVICE static constexpr - ::RAJA::reduce::detail::ValueLoc min() +struct limits<::RAJA::reduce::detail::ValueLoc> +{ + RAJA_INLINE RAJA_HOST_DEVICE static constexpr ::RAJA::reduce::detail:: + ValueLoc + min() 
{ return ::RAJA::reduce::detail::ValueLoc(limits::min()); } - RAJA_INLINE RAJA_HOST_DEVICE static constexpr - ::RAJA::reduce::detail::ValueLoc max() + RAJA_INLINE RAJA_HOST_DEVICE static constexpr ::RAJA::reduce::detail:: + ValueLoc + max() { return ::RAJA::reduce::detail::ValueLoc(limits::max()); } }; -} // namespace operators +} // namespace operators namespace reduce { @@ -208,8 +222,7 @@ class BaseReduce RAJA_HOST_DEVICE BaseReduce(T init_val, T identity_ = Reduce::identity()) : c{init_val, identity_} - { - } + {} RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE @@ -220,27 +233,27 @@ class BaseReduce } //! prohibit compiler-generated copy assignment - BaseReduce &operator=(const BaseReduce &) = delete; + BaseReduce& operator=(const BaseReduce&) = delete; //! compiler-generated copy constructor RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE - BaseReduce(const BaseReduce ©) : c(copy.c) {} + BaseReduce(const BaseReduce& copy) : c(copy.c) {} //! compiler-generated move constructor RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE RAJA_INLINE - BaseReduce(BaseReduce &©) : c(std::move(copy.c)) {} + BaseReduce(BaseReduce&& copy) : c(std::move(copy.c)) {} //! compiler-generated move assignment - BaseReduce &operator=(BaseReduce &&) = default; + BaseReduce& operator=(BaseReduce&&) = default; RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE - void combine(T const &other) const { c.combine(other); } + void combine(T const& other) const { c.combine(other); } - T &local() const { return c.local(); } + T& local() const { return c.local(); } //! Get the calculated reduced value operator T() const { return c.get(); } @@ -253,7 +266,7 @@ template class BaseCombinable { protected: - BaseCombinable const *parent = nullptr; + BaseCombinable const* parent = nullptr; T identity; T mutable my_data; @@ -266,8 +279,7 @@ class BaseCombinable RAJA_HOST_DEVICE constexpr BaseCombinable(T init_val, T identity_ = T()) : identity{identity_}, my_data{init_val} - { - } + {} RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE @@ -279,25 +291,25 @@ class BaseCombinable RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE - constexpr BaseCombinable(BaseCombinable const &other) + constexpr BaseCombinable(BaseCombinable const& other) : parent{other.parent ? other.parent : &other}, identity{other.identity}, my_data{identity} - { - } + {} RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE ~BaseCombinable() { - if (parent && my_data != identity) { + if (parent && my_data != identity) + { Reduce()(parent->my_data, my_data); } } RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE - void combine(T const &other) { Reduce{}(my_data, other); } + void combine(T const& other) { Reduce{}(my_data, other); } /*! * \return the calculated reduced value @@ -307,17 +319,17 @@ class BaseCombinable /*! * \return reference to the local value */ - T &local() const { return my_data; } + T& local() const { return my_data; } T get_combined() const { return my_data; } private: // Convenience method for CRTP - const Derived &derived() const + const Derived& derived() const { - return *(static_cast(this)); + return *(static_cast(this)); } - Derived &derived() { return *(static_cast(this)); } + Derived& derived() { return *(static_cast(this)); } }; /*! @@ -336,7 +348,7 @@ class BaseReduceMin : public BaseReduce //! 
reducer function; updates the current instance's state RAJA_HOST_DEVICE - const BaseReduceMin &min(T rhs) const + const BaseReduceMin& min(T rhs) const { this->combine(rhs); return *this; @@ -350,7 +362,10 @@ class BaseReduceMin : public BaseReduce * ************************************************************************** */ -template class Combiner> +template + class Combiner> class BaseReduceMinLoc : public BaseReduce, RAJA::reduce::min, Combiner> { @@ -362,24 +377,28 @@ class BaseReduceMinLoc constexpr BaseReduceMinLoc() : Base(value_type(T(), IndexType())) {} - constexpr BaseReduceMinLoc(T init_val, IndexType init_idx, - T identity_val_ = reduce_type::identity(), - IndexType identity_loc_ = DefaultLoc().value()) - : Base(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_)) - { - } - - void reset(T init_val, IndexType init_idx, + constexpr BaseReduceMinLoc( + T init_val, + IndexType init_idx, + T identity_val_ = reduce_type::identity(), + IndexType identity_loc_ = DefaultLoc().value()) + : Base(value_type(init_val, init_idx), + value_type(identity_val_, identity_loc_)) + {} + + void reset(T init_val, + IndexType init_idx, T identity_val_ = reduce_type::identity(), IndexType identity_loc_ = DefaultLoc().value()) { operator T(); // automatic get() before reset - Base::reset(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_)); + Base::reset(value_type(init_val, init_idx), + value_type(identity_val_, identity_loc_)); } /// \brief reducer function; updates the current instance's state RAJA_HOST_DEVICE - const BaseReduceMinLoc &minloc(T rhs, IndexType loc) const + const BaseReduceMinLoc& minloc(T rhs, IndexType loc) const { this->combine(value_type(rhs, loc)); return *this; @@ -408,7 +427,7 @@ class BaseReduceMax : public BaseReduce //! reducer function; updates the current instance's state RAJA_HOST_DEVICE - const BaseReduceMax &max(T rhs) const + const BaseReduceMax& max(T rhs) const { this->combine(rhs); return *this; @@ -432,7 +451,7 @@ class BaseReduceSum : public BaseReduce //! reducer function; updates the current instance's state RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE - const BaseReduceSum &operator+=(T rhs) const + const BaseReduceSum& operator+=(T rhs) const { this->combine(rhs); return *this; @@ -456,7 +475,7 @@ class BaseReduceBitOr : public BaseReduce //! reducer function; updates the current instance's state RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE - const BaseReduceBitOr &operator|=(T rhs) const + const BaseReduceBitOr& operator|=(T rhs) const { this->combine(rhs); return *this; @@ -480,7 +499,7 @@ class BaseReduceBitAnd : public BaseReduce //! 
reducer function; updates the current instance's state RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE - const BaseReduceBitAnd &operator&=(T rhs) const + const BaseReduceBitAnd& operator&=(T rhs) const { this->combine(rhs); return *this; @@ -495,36 +514,45 @@ class BaseReduceBitAnd : public BaseReduce * ************************************************************************** */ -template class Combiner> -class BaseReduceMaxLoc - : public BaseReduce, RAJA::reduce::max, Combiner> +template + class Combiner> +class BaseReduceMaxLoc : public BaseReduce, + RAJA::reduce::max, + Combiner> { public: - using Base = BaseReduce, RAJA::reduce::max, Combiner>; + using Base = + BaseReduce, RAJA::reduce::max, Combiner>; using value_type = typename Base::value_type; using reduce_type = typename Base::reduce_type; using Base::Base; constexpr BaseReduceMaxLoc() : Base(value_type(T(), IndexType())) {} - constexpr BaseReduceMaxLoc(T init_val, IndexType init_idx, - T identity_val_ = reduce_type::identity(), - IndexType identity_loc_ = DefaultLoc().value()) - : Base(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_)) - { - } - - void reset(T init_val, IndexType init_idx, + constexpr BaseReduceMaxLoc( + T init_val, + IndexType init_idx, + T identity_val_ = reduce_type::identity(), + IndexType identity_loc_ = DefaultLoc().value()) + : Base(value_type(init_val, init_idx), + value_type(identity_val_, identity_loc_)) + {} + + void reset(T init_val, + IndexType init_idx, T identity_val_ = reduce_type::identity(), IndexType identity_loc_ = DefaultLoc().value()) { operator T(); // automatic get() before reset - Base::reset(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_)); + Base::reset(value_type(init_val, init_idx), + value_type(identity_val_, identity_loc_)); } //! reducer function; updates the current instance's state RAJA_HOST_DEVICE - const BaseReduceMaxLoc &maxloc(T rhs, IndexType loc) const + const BaseReduceMaxLoc& maxloc(T rhs, IndexType loc) const { this->combine(value_type(rhs, loc)); return *this; @@ -537,10 +565,10 @@ class BaseReduceMaxLoc operator T() const { return Base::get(); } }; -} // namespace detail +} // namespace detail -} // namespace reduce +} // namespace reduce -} // namespace RAJA +} // namespace RAJA #endif /* RAJA_PATTERN_DETAIL_REDUCE_HPP */ diff --git a/include/RAJA/pattern/forall.hpp b/include/RAJA/pattern/forall.hpp index 686f0e8c6b..2382f2bc78 100644 --- a/include/RAJA/pattern/forall.hpp +++ b/include/RAJA/pattern/forall.hpp @@ -98,7 +98,8 @@ namespace detail { /// Adapter to replace specific implementations for the icount variants template -struct icount_adapter { +struct icount_adapter +{ using index_type = typename std::decay::type; typename std::decay::type body; using container_type = typename std::decay::type; @@ -119,20 +120,32 @@ struct icount_adapter { } }; -struct CallForall { - template - RAJA_INLINE camp::resources::EventProxy operator()(T const&, ExecPol, Body, Res, ForallParams) const; +struct CallForall +{ + template + RAJA_INLINE camp::resources::EventProxy + operator()(T const&, ExecPol, Body, Res, ForallParams) const; }; -struct CallForallIcount { +struct CallForallIcount +{ constexpr CallForallIcount(int s); - template - RAJA_INLINE camp::resources::EventProxy operator()(T const&, ExecPol, Body, Res, ForallParams) const; + template + RAJA_INLINE camp::resources::EventProxy + operator()(T const&, ExecPol, Body, Res, ForallParams) const; const int start; }; -} // namespace detail +} // namespace detail /*! 
****************************************************************************** @@ -152,12 +165,20 @@ namespace wrap * ****************************************************************************** */ -template +template RAJA_INLINE concepts::enable_if_t< RAJA::resources::EventProxy, concepts::negate>, type_traits::is_range> -forall(Res r, ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body, ForallParams&& f_params) +forall(Res r, + ExecutionPolicy&& p, + Container&& c, + LoopBody&& loop_body, + ForallParams&& f_params) { RAJA_FORCEINLINE_RECURSIVE return forall_impl(r, @@ -167,7 +188,10 @@ forall(Res r, ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body, ForallPa std::forward(f_params)); } -template +template RAJA_INLINE concepts::enable_if_t< RAJA::resources::EventProxy, concepts::negate>, @@ -197,22 +221,25 @@ template RAJA_INLINE resources::EventProxy forall_Icount(Res r, - ExecutionPolicy&& p, - Container&& c, - IndexType&& icount, - LoopBody&& loop_body, - ForallParams&& f_params) + ExecutionPolicy&& p, + Container&& c, + IndexType&& icount, + LoopBody&& loop_body, + ForallParams&& f_params) { using std::begin; using std::distance; using std::end; auto range = RangeSegment(0, distance(begin(c), end(c))); - detail::icount_adapter adapted(c, - loop_body, - icount); + detail::icount_adapter adapted( + c, loop_body, icount); using policy::sequential::forall_impl; RAJA_FORCEINLINE_RECURSIVE - return forall_impl(r, std::forward(p), range, adapted, std::forward(f_params)); + return forall_impl(r, + std::forward(p), + range, + adapted, + std::forward(f_params)); } /*! @@ -230,15 +257,16 @@ template -RAJA_INLINE resources::EventProxy forall_Icount(Res r, - ExecPolicy, - const TypedIndexSet& iset, - LoopBody loop_body, - ForallParams f_params) +RAJA_INLINE resources::EventProxy +forall_Icount(Res r, + ExecPolicy, + const TypedIndexSet& iset, + LoopBody loop_body, + ForallParams f_params) { // no need for icount variant here - auto segIterRes = resources::get_resource::type::get_default(); + auto segIterRes = + resources::get_resource::type::get_default(); wrap::forall(segIterRes, SegmentIterPolicy(), iset, [=, &r](int segID) { iset.segmentCall(segID, detail::CallForallIcount(iset.getStartingIcount(segID)), @@ -256,30 +284,35 @@ template -RAJA_INLINE resources::EventProxy forall(Res r, - ExecPolicy, - const TypedIndexSet& iset, - LoopBody loop_body, - ForallParams f_params) -{ - auto segIterRes = resources::get_resource::type::get_default(); +RAJA_INLINE resources::EventProxy +forall(Res r, + ExecPolicy, + const TypedIndexSet& iset, + LoopBody loop_body, + ForallParams f_params) +{ + auto segIterRes = + resources::get_resource::type::get_default(); wrap::forall(segIterRes, SegmentIterPolicy(), iset, [=, &r](int segID) { - iset.segmentCall(segID, detail::CallForall{}, SegmentExecPolicy(), loop_body, r, f_params); + iset.segmentCall(segID, + detail::CallForall{}, + SegmentExecPolicy(), + loop_body, + r, + f_params); }); return RAJA::resources::EventProxy(r); } -} // end namespace wrap - +} // end namespace wrap /*! ****************************************************************************** * - * \brief The RAJA::policy_by_value_interface forall functions provide an interface with - * value-based policies. It also enforces the interface and performs - * static checks as well as triggering plugins and loop body updates. + * \brief The RAJA::policy_by_value_interface forall functions provide an + *interface with value-based policies. 
It also enforces the interface and + *performs static checks as well as triggering plugins and loop body updates. * ****************************************************************************** */ @@ -294,11 +327,12 @@ inline namespace policy_by_value_interface * ****************************************************************************** */ -template -RAJA_INLINE resources::EventProxy forall_Icount(ExecutionPolicy&& p, - Res r, - IdxSet&& c, - Params&&... params) +template +RAJA_INLINE resources::EventProxy +forall_Icount(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params) { static_assert(type_traits::is_index_set::value, "Expected a TypedIndexSet but did not get one. Are you using " @@ -306,9 +340,10 @@ RAJA_INLINE resources::EventProxy forall_Icount(ExecutionPolicy&& p, auto f_params = expt::make_forall_param_pack(std::forward(params)...); auto&& loop_body = expt::get_lambda(std::forward(params)...); - //expt::check_forall_optional_args(loop_body, f_params); + // expt::check_forall_optional_args(loop_body, f_params); - util::PluginContext context{util::make_context>()}; + util::PluginContext context{ + util::make_context>()}; util::callPreCapturePlugins(context); using RAJA::util::trigger_updates_before; @@ -318,21 +353,23 @@ RAJA_INLINE resources::EventProxy forall_Icount(ExecutionPolicy&& p, util::callPreLaunchPlugins(context); - RAJA::resources::EventProxy e = wrap::forall_Icount( - r, - std::forward(p), - std::forward(c), - std::move(body), - f_params); + RAJA::resources::EventProxy e = + wrap::forall_Icount(r, + std::forward(p), + std::forward(c), + std::move(body), + f_params); util::callPostLaunchPlugins(context); return e; } -template ::type > -RAJA_INLINE resources::EventProxy forall_Icount(ExecutionPolicy&& p, - IdxSet&& c, - LoopBody&& loop_body) +template < + typename ExecutionPolicy, + typename IdxSet, + typename LoopBody, + typename Res = typename resources::get_resource::type> +RAJA_INLINE resources::EventProxy +forall_Icount(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body) { auto r = Res::get_default(); return ::RAJA::policy_by_value_interface::forall_Icount( @@ -349,11 +386,14 @@ RAJA_INLINE resources::EventProxy forall_Icount(ExecutionPolicy&& p, * ****************************************************************************** */ -template -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - type_traits::is_indexset_policy> -forall(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params) +template +RAJA_INLINE + concepts::enable_if_t, + type_traits::is_indexset_policy> + forall(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params) { static_assert(type_traits::is_index_set::value, "Expected a TypedIndexSet but did not get one. Are you using " @@ -363,7 +403,8 @@ forall(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params) auto&& loop_body = expt::get_lambda(std::forward(params)...); expt::check_forall_optional_args(loop_body, f_params); - util::PluginContext context{util::make_context>()}; + util::PluginContext context{ + util::make_context>()}; util::callPreCapturePlugins(context); using RAJA::util::trigger_updates_before; @@ -373,22 +414,24 @@ forall(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... 
params) util::callPreLaunchPlugins(context); - resources::EventProxy e = wrap::forall( - r, - std::forward(p), - std::forward(c), - std::move(body), - f_params); + resources::EventProxy e = wrap::forall(r, + std::forward(p), + std::forward(c), + std::move(body), + f_params); util::callPostLaunchPlugins(context); return e; } -template ::type > -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - type_traits::is_indexset_policy> -forall(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body) +template < + typename ExecutionPolicy, + typename IdxSet, + typename LoopBody, + typename Res = typename resources::get_resource::type> +RAJA_INLINE + concepts::enable_if_t, + type_traits::is_indexset_policy> + forall(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body) { auto r = Res::get_default(); return ::RAJA::policy_by_value_interface::forall( @@ -405,12 +448,14 @@ forall(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body) * ****************************************************************************** */ -template ::type > -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - type_traits::is_multi_policy, - type_traits::is_range> +template < + typename ExecutionPolicy, + typename Container, + typename LoopBody, + typename Res = typename resources::get_resource::type> +RAJA_INLINE concepts::enable_if_t, + type_traits::is_multi_policy, + type_traits::is_range> forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body) { static_assert(type_traits::is_random_access_range::value, @@ -420,9 +465,9 @@ forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body) // plugins handled in multipolicy policy_invoker return forall_impl(r, - std::forward(p), - std::forward(c), - std::forward(loop_body)); + std::forward(p), + std::forward(c), + std::forward(loop_body)); } /*! 
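The forall overloads reflowed in this file are what user code reaches through RAJA::forall and RAJA::forall_Icount. A minimal sketch of both entry points with the sequential policy, assuming the range overload of forall_Icount that takes an icount offset before the lambda (the array names are illustrative):

#include "RAJA/RAJA.hpp"

void daxpy(double* y, const double* x, double a, int n)
{
  // Basic forall: the body receives the loop index.
  RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, n),
                               [=](int i) { y[i] += a * x[i]; });

  // forall_Icount: the body receives (icount, index); icount starts at the
  // offset passed after the segment (0 here), which matters for index sets
  // and non-zero starting counts.
  RAJA::forall_Icount<RAJA::seq_exec>(
      RAJA::RangeSegment(0, n), 0,
      [=](int icount, int i) { y[i] += a * x[icount]; });
}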
@@ -438,10 +483,9 @@ template -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - type_traits::is_range, - type_traits::is_integral> +RAJA_INLINE concepts::enable_if_t, + type_traits::is_range, + type_traits::is_integral> forall_Icount(ExecutionPolicy&& p, Res r, Container&& c, @@ -452,11 +496,14 @@ forall_Icount(ExecutionPolicy&& p, static_assert(type_traits::is_random_access_range::value, "Container does not model RandomAccessIterator"); - auto f_params = expt::make_forall_param_pack(std::forward(first), std::forward(params)...); - auto&& loop_body = expt::get_lambda(std::forward(first), std::forward(params)...); - //expt::check_forall_optional_args(loop_body, f_params); + auto f_params = expt::make_forall_param_pack(std::forward(first), + std::forward(params)...); + auto&& loop_body = expt::get_lambda(std::forward(first), + std::forward(params)...); + // expt::check_forall_optional_args(loop_body, f_params); - util::PluginContext context{util::make_context>()}; + util::PluginContext context{ + util::make_context>()}; util::callPreCapturePlugins(context); using RAJA::util::trigger_updates_before; @@ -466,22 +513,23 @@ forall_Icount(ExecutionPolicy&& p, util::callPreLaunchPlugins(context); - resources::EventProxy e = wrap::forall_Icount( - r, - std::forward(p), - std::forward(c), - icount, - std::move(body), - f_params); + resources::EventProxy e = + wrap::forall_Icount(r, + std::forward(p), + std::forward(c), + icount, + std::move(body), + f_params); util::callPostLaunchPlugins(context); return e; } -template ::type > +template < + typename ExecutionPolicy, + typename Container, + typename IndexType, + typename LoopBody, + typename Res = typename resources::get_resource::type> RAJA_INLINE concepts::enable_if_t< resources::EventProxy, type_traits::is_range, @@ -509,7 +557,10 @@ forall_Icount(ExecutionPolicy&& p, ****************************************************************************** */ -template +template RAJA_INLINE concepts::enable_if_t< resources::EventProxy, concepts::negate>, @@ -524,7 +575,8 @@ forall(ExecutionPolicy&& p, Res r, Container&& c, Params&&... params) auto&& loop_body = expt::get_lambda(std::forward(params)...); expt::check_forall_optional_args(loop_body, f_params); - util::PluginContext context{util::make_context>()}; + util::PluginContext context{ + util::make_context>()}; util::callPreCapturePlugins(context); using RAJA::util::trigger_updates_before; @@ -534,19 +586,21 @@ forall(ExecutionPolicy&& p, Res r, Container&& c, Params&&... params) util::callPreLaunchPlugins(context); - resources::EventProxy e = wrap::forall( - r, - std::forward(p), - std::forward(c), - std::move(body), - f_params); + resources::EventProxy e = wrap::forall(r, + std::forward(p), + std::forward(c), + std::move(body), + f_params); util::callPostLaunchPlugins(context); return e; } -template ::type > +template < + typename ExecutionPolicy, + typename Container, + typename LoopBody, + typename Res = typename resources::get_resource::type> RAJA_INLINE concepts::enable_if_t< resources::EventProxy, concepts::negate>, @@ -562,7 +616,7 @@ forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body) std::forward(loop_body)); } -} // end inline namespace policy_by_value_interface +} // namespace policy_by_value_interface /*! @@ -570,8 +624,10 @@ forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body) * * this reduces implementation overhead and perfectly forwards all arguments */ -template ::type > +template < + typename ExecutionPolicy, + typename... 
Args, + typename Res = typename resources::get_resource::type> RAJA_INLINE resources::EventProxy forall(Args&&... args) { Res r = Res::get_default(); @@ -579,7 +635,8 @@ RAJA_INLINE resources::EventProxy forall(Args&&... args) ExecutionPolicy(), r, std::forward(args)...); } template -RAJA_INLINE concepts::enable_if_t, type_traits::is_resource> +RAJA_INLINE concepts::enable_if_t, + type_traits::is_resource> forall(Res r, Args&&... args) { return ::RAJA::policy_by_value_interface::forall( @@ -592,8 +649,10 @@ forall(Res r, Args&&... args) * * this reduces implementation overhead and perfectly forwards all arguments */ -template ::type > +template < + typename ExecutionPolicy, + typename... Args, + typename Res = typename resources::get_resource::type> RAJA_INLINE resources::EventProxy forall_Icount(Args&&... args) { Res r = Res::get_default(); @@ -601,7 +660,8 @@ RAJA_INLINE resources::EventProxy forall_Icount(Args&&... args) ExecutionPolicy(), r, std::forward(args)...); } template -RAJA_INLINE concepts::enable_if_t, type_traits::is_resource> +RAJA_INLINE concepts::enable_if_t, + type_traits::is_resource> forall_Icount(Res r, Args&&... args) { return ::RAJA::policy_by_value_interface::forall_Icount( @@ -611,12 +671,17 @@ forall_Icount(Res r, Args&&... args) namespace detail { -template -RAJA_INLINE camp::resources::EventProxy CallForall::operator()(T const& segment, - ExecutionPolicy, - LoopBody body, - Res r, - ForallParams f_params) const +template +RAJA_INLINE camp::resources::EventProxy +CallForall::operator()(T const& segment, + ExecutionPolicy, + LoopBody body, + Res r, + ForallParams f_params) const { // this is only called inside a region, use impl using policy::sequential::forall_impl; @@ -626,18 +691,24 @@ RAJA_INLINE camp::resources::EventProxy CallForall::operator()(T const& seg constexpr CallForallIcount::CallForallIcount(int s) : start(s) {} -template -RAJA_INLINE camp::resources::EventProxy CallForallIcount::operator()(T const& segment, - ExecutionPolicy, - LoopBody body, - Res r, - ForallParams f_params) const +template +RAJA_INLINE camp::resources::EventProxy +CallForallIcount::operator()(T const& segment, + ExecutionPolicy, + LoopBody body, + Res r, + ForallParams f_params) const { // go through wrap to unwrap icount - return wrap::forall_Icount(r, ExecutionPolicy(), segment, start, body, f_params); + return wrap::forall_Icount( + r, ExecutionPolicy(), segment, start, body, f_params); } -} // namespace detail +} // namespace detail // // Experimental support for dynamic policy selection @@ -650,104 +721,116 @@ RAJA_INLINE camp::resources::EventProxy CallForallIcount::operator()(T cons namespace expt { - template - struct dynamic_helper +template +struct dynamic_helper +{ + template + static void invoke_forall(const int pol, SEGMENT const& seg, BODY const& body) { - template - static void invoke_forall(const int pol, SEGMENT const &seg, BODY const &body) + if (IDX == pol) { - if(IDX==pol){ - using t_pol = typename camp::at>::type; - RAJA::forall(seg, body); - return; - } - dynamic_helper::invoke_forall(pol, seg, body); + using t_pol = typename camp::at>::type; + RAJA::forall(seg, body); + return; } + dynamic_helper::invoke_forall(pol, seg, body); + } - template - static resources::EventProxy - invoke_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body) - { - - using t_pol = typename camp::at>::type; - using resource_type = typename resources::get_resource::type; + template + static resources::EventProxy + 
invoke_forall(RAJA::resources::Resource r, + const int pol, + SEGMENT const& seg, + BODY const& body) + { - if(IDX==pol){ - RAJA::forall(r.get(), seg, body); + using t_pol = typename camp::at>::type; + using resource_type = typename resources::get_resource::type; - //Return a generic event proxy from r, - //because forall returns a typed event proxy - return {r}; - } + if (IDX == pol) + { + RAJA::forall(r.get(), seg, body); - return dynamic_helper::invoke_forall(r, pol, seg, body); + // Return a generic event proxy from r, + // because forall returns a typed event proxy + return {r}; } - }; + return dynamic_helper::invoke_forall( + r, pol, seg, body); + } +}; - template - struct dynamic_helper<0, POLICY_LIST> +template +struct dynamic_helper<0, POLICY_LIST> +{ + template + static void invoke_forall(const int pol, SEGMENT const& seg, BODY const& body) { - template - static void - invoke_forall(const int pol, SEGMENT const &seg, BODY const &body) + if (0 == pol) { - if(0==pol){ - using t_pol = typename camp::at>::type; - RAJA::forall(seg, body); - return; - } - RAJA_ABORT_OR_THROW("Policy enum not supported "); + using t_pol = typename camp::at>::type; + RAJA::forall(seg, body); + return; } + RAJA_ABORT_OR_THROW("Policy enum not supported "); + } - template - static resources::EventProxy - invoke_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body) - { - if(pol != 0) RAJA_ABORT_OR_THROW("Policy value out of range "); + template + static resources::EventProxy + invoke_forall(RAJA::resources::Resource r, + const int pol, + SEGMENT const& seg, + BODY const& body) + { + if (pol != 0) RAJA_ABORT_OR_THROW("Policy value out of range "); - using t_pol = typename camp::at>::type; - using resource_type = typename resources::get_resource::type; + using t_pol = typename camp::at>::type; + using resource_type = typename resources::get_resource::type; - RAJA::forall(r.get(), seg, body); + RAJA::forall(r.get(), seg, body); - //Return a generic event proxy from r, - //because forall returns a typed event proxy - return {r}; - } + // Return a generic event proxy from r, + // because forall returns a typed event proxy + return {r}; + } +}; - }; +template +void dynamic_forall(const int pol, SEGMENT const& seg, BODY const& body) +{ + constexpr int N = camp::size::value; + static_assert(N > 0, "RAJA policy list must not be empty"); - template - void dynamic_forall(const int pol, SEGMENT const &seg, BODY const &body) + if (pol > N - 1) { - constexpr int N = camp::size::value; - static_assert(N > 0, "RAJA policy list must not be empty"); - - if(pol > N-1) { - RAJA_ABORT_OR_THROW("Policy enum not supported"); - } - dynamic_helper::invoke_forall(pol, seg, body); + RAJA_ABORT_OR_THROW("Policy enum not supported"); } + dynamic_helper::invoke_forall(pol, seg, body); +} - template - resources::EventProxy - dynamic_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body) - { - constexpr int N = camp::size::value; - static_assert(N > 0, "RAJA policy list must not be empty"); - - if(pol > N-1) { - RAJA_ABORT_OR_THROW("Policy value out of range"); - } +template +resources::EventProxy +dynamic_forall(RAJA::resources::Resource r, + const int pol, + SEGMENT const& seg, + BODY const& body) +{ + constexpr int N = camp::size::value; + static_assert(N > 0, "RAJA policy list must not be empty"); - return dynamic_helper::invoke_forall(r, pol, seg, body); + if (pol > N - 1) + { + RAJA_ABORT_OR_THROW("Policy value out of range"); } -} // namespace expt + return 
dynamic_helper::invoke_forall(r, pol, seg, body); +} + +} // namespace expt -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/kernel.hpp b/include/RAJA/pattern/kernel.hpp index 1875fe27d9..da1e6f6be7 100644 --- a/include/RAJA/pattern/kernel.hpp +++ b/include/RAJA/pattern/kernel.hpp @@ -55,37 +55,36 @@ template struct IterableWrapperTuple; template -struct IterableWrapperTuple> { +struct IterableWrapperTuple> +{ - using type = - camp::tuple::iterator, - typename camp::decay::IndexType>...>; + using type = camp::tuple::iterator, + typename camp::decay::IndexType>...>; }; namespace internal { template -RAJA_INLINE constexpr auto make_wrapped_tuple_impl(Tuple &&t, - camp::idx_seq) - -> camp::tuple>>::iterator, - typename camp::decay< - camp::tuple_element_t>>::IndexType>...> +RAJA_INLINE constexpr auto +make_wrapped_tuple_impl(Tuple&& t, camp::idx_seq) -> camp::tuple< + RAJA::Span>>::iterator, + typename camp::decay< + camp::tuple_element_t>>::IndexType>...> { return camp::make_tuple( - RAJA::Span< - typename camp::decay< - camp::tuple_element_t>>::iterator, - typename camp::decay>>:: - IndexType>{camp::get(std::forward(t)).begin(), - camp::get(std::forward(t)).end()}...); + RAJA::Span>>::iterator, + typename camp::decay< + camp::tuple_element_t>>::IndexType>{ + camp::get(std::forward(t)).begin(), + camp::get(std::forward(t)).end()}...); } -} // namespace internal +} // namespace internal template -RAJA_INLINE constexpr auto make_wrapped_tuple(Tuple &&t) +RAJA_INLINE constexpr auto make_wrapped_tuple(Tuple&& t) -> decltype(internal::make_wrapped_tuple_impl( std::forward(t), camp::make_idx_seq_t>::value>{})) @@ -101,10 +100,11 @@ template -RAJA_INLINE resources::EventProxy kernel_param_resource(SegmentTuple &&segments, - ParamTuple &¶ms, - Resource resource, - Bodies &&... bodies) +RAJA_INLINE resources::EventProxy +kernel_param_resource(SegmentTuple&& segments, + ParamTuple&& params, + Resource resource, + Bodies&&... bodies) { util::PluginContext context{util::make_context()}; @@ -131,11 +131,11 @@ RAJA_INLINE resources::EventProxy kernel_param_resource(SegmentTuple & // our segments, loop bodies, and the tuple of loop indices // it is passed through all of the kernel mechanics by-referenece, // and only copied to provide thread-private instances. - loop_data_t loop_data(make_wrapped_tuple( - std::forward(segments)), - std::forward(params), - resource, - std::forward(bodies)...); + loop_data_t loop_data( + make_wrapped_tuple(std::forward(segments)), + std::forward(params), + resource, + std::forward(bodies)...); util::callPostCapturePlugins(context); @@ -156,44 +156,45 @@ template -RAJA_INLINE resources::EventProxy kernel_resource(SegmentTuple &&segments, - Resource resource, - Bodies &&... bodies) +RAJA_INLINE resources::EventProxy +kernel_resource(SegmentTuple&& segments, Resource resource, Bodies&&... bodies) { - return RAJA::kernel_param_resource(std::forward(segments), - RAJA::make_tuple(), - resource, - std::forward(bodies)...); + return RAJA::kernel_param_resource( + std::forward(segments), + RAJA::make_tuple(), + resource, + std::forward(bodies)...); } template -RAJA_INLINE resources::EventProxy> kernel_param(SegmentTuple &&segments, - ParamTuple &¶ms, - Bodies &&... bodies) +RAJA_INLINE resources::EventProxy> +kernel_param(SegmentTuple&& segments, ParamTuple&& params, Bodies&&... 
bodies) { auto res = resources::get_default_resource(); - return RAJA::kernel_param_resource(std::forward(segments), - std::forward(params), - res, - std::forward(bodies)...); + return RAJA::kernel_param_resource( + std::forward(segments), + std::forward(params), + res, + std::forward(bodies)...); } template -RAJA_INLINE resources::EventProxy> kernel(SegmentTuple &&segments, - Bodies &&... bodies) +RAJA_INLINE resources::EventProxy> +kernel(SegmentTuple&& segments, Bodies&&... bodies) { auto res = resources::get_default_resource(); - return RAJA::kernel_param_resource(std::forward(segments), - RAJA::make_tuple(), - res, - std::forward(bodies)...); + return RAJA::kernel_param_resource( + std::forward(segments), + RAJA::make_tuple(), + res, + std::forward(bodies)...); } -} // end namespace RAJA +} // end namespace RAJA #include "RAJA/pattern/kernel/Collapse.hpp" diff --git a/include/RAJA/pattern/kernel/Collapse.hpp b/include/RAJA/pattern/kernel/Collapse.hpp index 8efb126397..095ad402ef 100644 --- a/include/RAJA/pattern/kernel/Collapse.hpp +++ b/include/RAJA/pattern/kernel/Collapse.hpp @@ -29,12 +29,12 @@ namespace statement template struct Collapse : public internal::ForList, public internal::CollapseBase, - public internal::Statement { -}; + public internal::Statement +{}; -} // namespace statement -} // end namespace RAJA +} // namespace statement +} // end namespace RAJA #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/Conditional.hpp b/include/RAJA/pattern/kernel/Conditional.hpp index 6b7875c4c2..450fecfd5d 100644 --- a/include/RAJA/pattern/kernel/Conditional.hpp +++ b/include/RAJA/pattern/kernel/Conditional.hpp @@ -37,8 +37,8 @@ namespace statement * */ template -struct If : public internal::Statement { -}; +struct If : public internal::Statement +{}; /*! 
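For the kernel / kernel_param / kernel_resource entry points reformatted above, a typical call pairs a tuple of segments with a KernelPolicy of nested For statements. A sketch of a sequential two-level nest (names are illustrative):

#include "RAJA/RAJA.hpp"

void init(double* A, int N, int M)
{
  using EXEC_POL = RAJA::KernelPolicy<
      RAJA::statement::For<1, RAJA::seq_exec,        // outer loop over segment 1 (rows)
          RAJA::statement::For<0, RAJA::seq_exec,    // inner loop over segment 0 (cols)
              RAJA::statement::Lambda<0>>>>;

  RAJA::kernel<EXEC_POL>(
      RAJA::make_tuple(RAJA::RangeSegment(0, M),     // segment 0 -> j
                       RAJA::RangeSegment(0, N)),    // segment 1 -> i
      [=](int j, int i) { A[i * M + j] = i + 0.1 * j; });
}

Lambda arguments follow segment-tuple order, not loop-nesting order, so segment 0 arrives first regardless of which For is outermost.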
@@ -46,10 +46,11 @@ struct If : public internal::Statement { * */ template -struct Value { +struct Value +{ template - RAJA_HOST_DEVICE RAJA_INLINE static long eval(Data const &) + RAJA_HOST_DEVICE RAJA_INLINE static long eval(Data const&) { return value; } @@ -60,10 +61,11 @@ struct Value { * */ template -struct Equals { +struct Equals +{ template - RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data) { return L::eval(data) == R::eval(data); } @@ -74,10 +76,11 @@ struct Equals { * */ template -struct NotEquals { +struct NotEquals +{ template - RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data) { return L::eval(data) != R::eval(data); } @@ -89,10 +92,11 @@ struct NotEquals { * */ template -struct Or { +struct Or +{ template - RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data) { return L::eval(data) || R::eval(data); } @@ -104,10 +108,11 @@ struct Or { * */ template -struct And { +struct And +{ template - RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data) { return L::eval(data) && R::eval(data); } @@ -119,10 +124,11 @@ struct And { * */ template -struct LessThan { +struct LessThan +{ template - RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data) { return L::eval(data) < R::eval(data); } @@ -134,10 +140,11 @@ struct LessThan { * */ template -struct LessThanEq { +struct LessThanEq +{ template - RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data) { return L::eval(data) <= R::eval(data); } @@ -149,10 +156,11 @@ struct LessThanEq { * */ template -struct GreaterThan { +struct GreaterThan +{ template - RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data) { return L::eval(data) > R::eval(data); } @@ -164,10 +172,11 @@ struct GreaterThan { * */ template -struct GreaterThanEq { +struct GreaterThanEq +{ template - RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data) { return L::eval(data) >= R::eval(data); } @@ -179,31 +188,34 @@ struct GreaterThanEq { * */ template -struct Not { +struct Not +{ template - RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data) { return !(L::eval(data)); } }; -} // end namespace statement +} // end namespace statement namespace internal { template -struct StatementExecutor, Types> { +struct StatementExecutor, Types> +{ template - static RAJA_INLINE void exec(Data &&data) + static RAJA_INLINE void exec(Data&& data) { - if (Condition::eval(data)) { + if (Condition::eval(data)) + { execute_statement_list, Types>( std::forward(data)); } @@ -211,8 +223,8 @@ struct StatementExecutor, Types> { }; -} // namespace internal -} // end namespace RAJA +} // namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/For.hpp b/include/RAJA/pattern/kernel/For.hpp index 539c451673..71b6bd3009 100644 --- a/include/RAJA/pattern/kernel/For.hpp +++ b/include/RAJA/pattern/kernel/For.hpp @@ -42,14 +42,15 @@ template struct For : public 
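The statement::If condition nodes reformatted above (Value, Equals, NotEquals, Or, And, LessThan, ...) evaluate against the kernel's parameter tuple. A hedged sketch that guards a lambda on a run-time flag supplied through kernel_param (the flag index and policy are assumptions for illustration):

#include "RAJA/RAJA.hpp"

void maybe_scale(double* x, int N, int flag)
{
  using POL = RAJA::KernelPolicy<
      RAJA::statement::For<0, RAJA::seq_exec,
          // Run Lambda<0> only when parameter 0 equals 1.
          RAJA::statement::If<
              RAJA::statement::Equals<RAJA::statement::Param<0>,
                                      RAJA::statement::Value<1>>,
              RAJA::statement::Lambda<0, RAJA::Segs<0>>>>>;

  RAJA::kernel_param<POL>(RAJA::make_tuple(RAJA::RangeSegment(0, N)),
                          RAJA::make_tuple(flag),
                          [=](int i) { x[i] *= 2.0; });
}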
internal::ForList, public internal::ForTraitBase, - public internal::Statement { + public internal::Statement +{ // TODO: add static_assert for valid policy in Pol using execution_policy_t = ExecPolicy; }; -} // end namespace statement +} // end namespace statement namespace internal { @@ -59,8 +60,12 @@ namespace internal * Assigns the loop index to offset ArgumentId * */ -template -struct ForWrapper : public GenericWrapper { +template +struct ForWrapper : public GenericWrapper +{ using Base = GenericWrapper; using Base::Base; @@ -85,11 +90,13 @@ template struct StatementExecutor< - statement::For, Types> { + statement::For, + Types> +{ template - static RAJA_INLINE void exec(Data &&data) + static RAJA_INLINE void exec(Data&& data) { // Set the argument type for this loop @@ -103,7 +110,11 @@ struct StatementExecutor< auto r = data.res; - forall_impl(r, ExecPolicy{}, TypedRangeSegment(0, len), for_wrapper, RAJA::expt::get_empty_forall_param_pack()); + forall_impl(r, + ExecPolicy{}, + TypedRangeSegment(0, len), + for_wrapper, + RAJA::expt::get_empty_forall_param_pack()); } }; @@ -112,15 +123,14 @@ struct StatementExecutor< * * */ -template -struct StatementExecutor< - statement::For, Types> { +template +struct StatementExecutor, + Types> +{ template - static RAJA_INLINE void exec(Data &&data) + static RAJA_INLINE void exec(Data&& data) { // Set the argument type for this loop @@ -134,15 +144,16 @@ struct StatementExecutor< RAJA_EXTRACT_BED_IT(TypedRangeSegment(0, len)); - for (decltype(distance_it) i = 0; i < distance_it; ++i) { + for (decltype(distance_it) i = 0; i < distance_it; ++i) + { for_wrapper(*(begin_it + i)); } } }; -} // namespace internal -} // end namespace RAJA +} // namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_For_HPP */ diff --git a/include/RAJA/pattern/kernel/ForICount.hpp b/include/RAJA/pattern/kernel/ForICount.hpp index 18515c7f59..9dfd2ca126 100644 --- a/include/RAJA/pattern/kernel/ForICount.hpp +++ b/include/RAJA/pattern/kernel/ForICount.hpp @@ -44,8 +44,9 @@ template struct ForICount : public internal::ForList, - public internal::ForTraitBase, - public internal::Statement { + public internal::ForTraitBase, + public internal::Statement +{ static_assert(std::is_base_of::value, "Inappropriate ParamId, ParamId must be of type " @@ -54,7 +55,7 @@ struct ForICount : public internal::ForList, using execution_policy_t = ExecPolicy; }; -} // end namespace statement +} // end namespace statement namespace internal { @@ -64,9 +65,13 @@ namespace internal * Assigns the loop index to offset ArgumentId * Assigns the loop index to param ParamId */ -template -struct ForICountWrapper : public GenericWrapper { +struct ForICountWrapper : public GenericWrapper +{ using Base = GenericWrapper; using Base::Base; @@ -93,32 +98,38 @@ template struct StatementExecutor< - statement::ForICount, Types> { + statement::ForICount, + Types> +{ template - static RAJA_INLINE void exec(Data &&data) + static RAJA_INLINE void exec(Data&& data) { // Set the argument type for this loop using NewTypes = setSegmentTypeFromData; // Create a wrapper, just in case forall_impl needs to thread_privatize - ForICountWrapper for_wrapper(data); + ForICountWrapper + for_wrapper(data); auto len = segment_length(data); using len_t = decltype(len); auto r = resources::get_resource::type::get_default(); - forall_impl(r, ExecPolicy{}, TypedRangeSegment(0, len), for_wrapper, RAJA::expt::get_empty_forall_param_pack()); + forall_impl(r, + ExecPolicy{}, + TypedRangeSegment(0, len), + for_wrapper, + 
RAJA::expt::get_empty_forall_param_pack()); } }; -} // namespace internal -} // end namespace RAJA +} // namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_nested_HPP */ diff --git a/include/RAJA/pattern/kernel/Hyperplane.hpp b/include/RAJA/pattern/kernel/Hyperplane.hpp index 955afcecc0..78a2383e43 100644 --- a/include/RAJA/pattern/kernel/Hyperplane.hpp +++ b/include/RAJA/pattern/kernel/Hyperplane.hpp @@ -81,21 +81,18 @@ template -struct Hyperplane - : public internal::Statement { -}; +struct Hyperplane : public internal::Statement +{}; -} // end namespace statement +} // end namespace statement namespace internal { template -struct HyperplaneInner - : public internal::Statement { -}; +struct HyperplaneInner : public internal::Statement +{}; template , ExecPolicy, - EnclosedStmts...>, Types> { + EnclosedStmts...>, + Types> +{ template - static RAJA_INLINE void exec(Data &data) + static RAJA_INLINE void exec(Data& data) { // get type of Hp arguments index @@ -135,9 +134,9 @@ struct StatementExecutor(data) + - foldl(RAJA::operators::plus(), - segment_length(data)...); + idx_t hp_len = + segment_length(data) + + foldl(RAJA::operators::plus(), segment_length(data)...); /* Execute the outer loop over hyperplanes * @@ -146,7 +145,8 @@ struct StatementExecutor::type::get_default(); - forall_impl(r, HpExecPolicy{}, + forall_impl(r, + HpExecPolicy{}, TypedRangeSegment(0, hp_len), outer_wrapper, RAJA::expt::get_empty_forall_param_pack()); @@ -159,11 +159,13 @@ template struct StatementExecutor< - HyperplaneInner, EnclosedStmts...>, Types> { + HyperplaneInner, EnclosedStmts...>, + Types> +{ template - static RAJA_INLINE void exec(Data &data) + static RAJA_INLINE void exec(Data& data) { // get h value @@ -173,13 +175,14 @@ struct StatementExecutor< // compute actual iterate for HpArgumentId // as: i0 = h - (i1 + i2 + i3 + ...) 
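statement::ForICount, shown in the ForICount.hpp hunk above, behaves like statement::For but additionally stores the local (0-based) loop count into the parameter named by statement::Param. A small sketch, assuming the usual pattern of reserving a parameter slot for the count:

#include "RAJA/RAJA.hpp"

void gather(int* out, int N, int offset)
{
  using POL = RAJA::KernelPolicy<
      RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec,
          RAJA::statement::Lambda<0>>>;

  // The segment starts at 'offset'; Param<0> receives the 0-based loop count.
  RAJA::kernel_param<POL>(
      RAJA::make_tuple(RAJA::RangeSegment(offset, offset + N)),
      RAJA::make_tuple(static_cast<RAJA::Index_type>(0)),
      [=](RAJA::Index_type i, RAJA::Index_type count) {
        out[count] = static_cast<int>(i);
      });
}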
idx_t i = h - foldl(RAJA::operators::plus(), - camp::get(data.offset_tuple)...); + camp::get(data.offset_tuple)...); // get length of Hp indexed argument auto len = segment_length(data); // check bounds - if (i >= 0 && i < len) { + if (i >= 0 && i < len) + { // store in tuple data.template assign_offset(i); @@ -194,8 +197,8 @@ struct StatementExecutor< }; -} // end namespace internal +} // end namespace internal -} // end namespace RAJA +} // end namespace RAJA #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/InitLocalMem.hpp b/include/RAJA/pattern/kernel/InitLocalMem.hpp index 21d9e3cd2a..b8cb6208d3 100644 --- a/include/RAJA/pattern/kernel/InitLocalMem.hpp +++ b/include/RAJA/pattern/kernel/InitLocalMem.hpp @@ -26,7 +26,7 @@ namespace RAJA { -//Policies for RAJA local arrays +// Policies for RAJA local arrays struct cpu_tile_mem; @@ -43,43 +43,51 @@ namespace statement * IntiLocalMem, statements...> * Will intialize the 0th array in the param tuple */ -template -struct InitLocalMem : public internal::Statement { -}; +template +struct InitLocalMem : public internal::Statement +{}; -//Policy Specialization -template -struct InitLocalMem, EnclosedStmts...> : public internal::Statement { -}; +// Policy Specialization +template +struct InitLocalMem, + EnclosedStmts...> : public internal::Statement +{}; -} // end namespace statement +} // end namespace statement namespace internal { -//Statement executor to initalize RAJA local array -template -struct StatementExecutor, EnclosedStmts...>, Types>{ - - //Execute statement list - template - static void RAJA_INLINE exec_expanded(Data && data) +// Statement executor to initalize RAJA local array +template +struct StatementExecutor, + EnclosedStmts...>, + Types> +{ + + // Execute statement list + template + static void RAJA_INLINE exec_expanded(Data&& data) { execute_statement_list, Types>(data); } - - //Intialize local array - //Identifies type + number of elements needed - template - static void RAJA_INLINE exec_expanded(Data && data) + + // Intialize local array + // Identifies type + number of elements needed + template + static void RAJA_INLINE exec_expanded(Data&& data) { - using varType = typename camp::tuple_element_t::param_tuple_t>::value_type; + using varType = typename camp::tuple_element_t< + Pos, + typename camp::decay::param_tuple_t>::value_type; // Initialize memory #ifdef RAJA_COMPILER_MSVC // MSVC doesn't like taking a pointer to stack allocated data?!?! 
- varType *ptr = new varType[camp::get(data.param_tuple).size()]; + varType* ptr = new varType[camp::get(data.param_tuple).size()]; camp::get(data.param_tuple).set_data(ptr); #else varType Array[camp::get(data.param_tuple).size()]; @@ -95,21 +103,19 @@ struct StatementExecutor - static RAJA_INLINE void exec(Data &&data) + + template + static RAJA_INLINE void exec(Data&& data) { - //Initalize local arrays + execute statements + cleanup + // Initalize local arrays + execute statements + cleanup exec_expanded(data); } - }; -} // namespace internal -} // end namespace RAJA +} // namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/Lambda.hpp b/include/RAJA/pattern/kernel/Lambda.hpp index 29d41b431e..69e8bd7f8c 100644 --- a/include/RAJA/pattern/kernel/Lambda.hpp +++ b/include/RAJA/pattern/kernel/Lambda.hpp @@ -46,28 +46,28 @@ struct lambda_arg_param_t struct lambda_arg_offset_t {}; -template +template struct lambda_arg_value_t { - using type = T; + using type = T; }; -template +template struct LambdaArg { - static constexpr camp::idx_t value = V; + static constexpr camp::idx_t value = V; }; -} - +} // namespace internal /*! * Used in RAJA::statement::Lambda to specify that one or more segment values * should be passed into the lambda as an argument */ -template -using Segs = camp::list...>; +template +using Segs = + camp::list...>; /*! * Used in RAJA::statement::Lambda to specify that one or more segment offsets @@ -79,16 +79,18 @@ using Segs = camp::list... * In the case of tiling (with Tile) the offset is w.r.t. the beginning of the * current tile. */ -template -using Offsets = camp::list...>; +template +using Offsets = + camp::list...>; /*! * Used in RAJA::statement::Lambda to specify that one or more parameters that * should be passed into the lambda as an argument. */ -template -using Params = camp::list...>; +template +using Params = + camp::list...>; /*! * Used in RAJA::statement::Lambda to specify that one or more constant values @@ -103,8 +105,9 @@ using Params = camp::list> * invokes: lambda0( (double)3, (double) 4 ) */ -template -using ValuesT = camp::list, values>...>; +template +using ValuesT = + camp::list, values>...>; namespace statement @@ -119,24 +122,18 @@ namespace statement * RAJA::kernel(make_tuple{s0, s1, s2}, lambda0, lambda1); * */ -template -struct Lambda : internal::Statement { +template +struct Lambda : internal::Statement +{ static const camp::idx_t loop_body_index = BodyIdx; }; -} // end namespace statement +} // end namespace statement namespace internal { - - - - - - - /* * Helper that extracts a segment value for a lambda argument * @@ -146,26 +143,25 @@ namespace internal * This class allows specialization on the segment type in LoopTypes so that * fancier constructions can happen (ie vector_exec, etc.) 
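InitLocalMem, reformatted above, attaches backing storage to a RAJA::LocalArray held in the parameter tuple (stack storage, or new[] under MSVC as the hunk notes), runs the enclosed statements, then releases it. A sketch following the RAJA local-array tutorial pattern; the LocalArray/ParamList indices and permutation are assumptions for illustration:

#include "RAJA/RAJA.hpp"

void copy_by_tiles(const int* in, int* out, int N)
{
  constexpr int TILE = 8;
  using Tile_t = RAJA::LocalArray<int, RAJA::Perm<0>, RAJA::SizeList<TILE>>;
  Tile_t scratch;  // memory is attached by InitLocalMem at run time

  using POL = RAJA::KernelPolicy<
      RAJA::statement::Tile<0, RAJA::tile_fixed<TILE>, RAJA::seq_exec,
          RAJA::statement::InitLocalMem<RAJA::cpu_tile_mem, RAJA::ParamList<1>,
              RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec,
                  RAJA::statement::Lambda<0>>,     // load the tile
              RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec,
                  RAJA::statement::Lambda<1>>>>>;  // store the tile

  RAJA::kernel_param<POL>(
      RAJA::make_tuple(RAJA::RangeSegment(0, N)),
      RAJA::make_tuple(static_cast<RAJA::Index_type>(0), scratch),
      [=](RAJA::Index_type i, RAJA::Index_type t, Tile_t& s) { s(t) = in[i]; },
      [=](RAJA::Index_type i, RAJA::Index_type t, Tile_t& s) { out[i] = s(t); });
}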
*/ -template +template struct LambdaSegExtractor { static_assert(!std::is_same::value, - "Segment not assigned, but used in Lambda with Segs<> argument"); + "Segment not " + "assigned, but used " + "in Lambda with " + "Segs<> argument"); - template - RAJA_HOST_DEVICE - RAJA_INLINE - constexpr - static SegmentType extract(Data &&data) + template + RAJA_HOST_DEVICE RAJA_INLINE constexpr static SegmentType extract(Data&& data) { - return SegmentType(camp::get(data.segment_tuple).begin()[camp::get(data.offset_tuple)]); + return SegmentType(camp::get(data.segment_tuple) + .begin()[camp::get(data.offset_tuple)]); } - }; - /* * Helper that extracts a segment value for a lambda argument * @@ -175,26 +171,24 @@ struct LambdaSegExtractor * This class allows specialization on the segment type in LoopTypes so that * fancier constructions can happen (ie vector_exec, etc.) */ -template +template struct LambdaOffsetExtractor { static_assert(!std::is_same::value, - "Segment not assigned, but used in Lambda with Offsets<> argument"); + "Segment not assigned, " + "but used in Lambda " + "with Offsets<> " + "argument"); - template - RAJA_HOST_DEVICE - RAJA_INLINE - constexpr - static OffsetType extract(Data &&data) + template + RAJA_HOST_DEVICE RAJA_INLINE constexpr static OffsetType extract(Data&& data) { return OffsetType(camp::get(data.offset_tuple)); } - }; - /* * Helper that provides first level of argument extraction * This acts as a switchboard between Segs, Offsets, and Params @@ -202,129 +196,134 @@ struct LambdaOffsetExtractor * It calls LambdaArgExtractor to perform the actual argument extraction. * This allows LambdaArgExtractor to be specialized */ -template +template struct LambdaArgSwitchboard; -template +template struct LambdaArgSwitchboard> { using OffsetType = camp::at_v; static_assert(!std::is_same::value, - "Offset not assigned, but used in Lambda with Offsets<> argument"); + "Offset not assigned, " + "but used in Lambda " + "with Offsets<> " + "argument"); - template - RAJA_HOST_DEVICE - RAJA_INLINE - constexpr - static OffsetType extract(Data &&data) + template + RAJA_HOST_DEVICE RAJA_INLINE constexpr static OffsetType extract(Data&& data) { - return LambdaOffsetExtractor::extract(std::forward(data)); + return LambdaOffsetExtractor::extract( + std::forward(data)); } - }; -template +template struct LambdaArgSwitchboard> { using SegmentType = camp::at_v; static_assert(!std::is_same::value, - "Segment not assigned, but used in Lambda with Segs<> argument"); + "Segment not " + "assigned, but used " + "in Lambda with " + "Segs<> argument"); - template - RAJA_HOST_DEVICE - RAJA_INLINE - constexpr - static SegmentType extract(Data &&data) + template + RAJA_HOST_DEVICE RAJA_INLINE constexpr static SegmentType extract(Data&& data) { - return LambdaSegExtractor::extract(std::forward(data)); + return LambdaSegExtractor::extract( + std::forward(data)); } - }; -template +template struct LambdaArgSwitchboard> { - template - RAJA_HOST_DEVICE - RAJA_INLINE - constexpr - static auto extract(Data &&data)-> - typename std::add_lvalue_reference::param_tuple_t>>::type + template + RAJA_HOST_DEVICE RAJA_INLINE constexpr static auto + extract(Data&& data) -> typename std::add_lvalue_reference< + camp::tuple_element_t::param_tuple_t>>::type { return camp::get(data.param_tuple); } }; -template +template struct LambdaArgSwitchboard, value>> { - template - RAJA_HOST_DEVICE - RAJA_INLINE - constexpr - static T extract(Data &&) + template + RAJA_HOST_DEVICE RAJA_INLINE constexpr static T extract(Data&&) { return 
T(value); } }; - RAJA_SUPPRESS_HD_WARN -template -RAJA_INLINE RAJA_HOST_DEVICE void invoke_lambda_with_args(Data &&data, - camp::list const &) +template +RAJA_INLINE RAJA_HOST_DEVICE void +invoke_lambda_with_args(Data&& data, camp::list const&) { camp::get(data.bodies)( LambdaArgSwitchboard::extract(data)...); } - - /*! * A RAJA::kernel statement that invokes a lambda function * with user specified arguments. */ -template -struct StatementExecutor, Types> { +template +struct StatementExecutor, Types> +{ template - static RAJA_INLINE RAJA_HOST_DEVICE void exec(Data &&data) + static RAJA_INLINE RAJA_HOST_DEVICE void exec(Data&& data) { - //Convert SegList, ParamList into Seg, Param types, and store in a list + // Convert SegList, ParamList into Seg, Param types, and store in a list using targList = typename camp::flatten>::type; - invoke_lambda_with_args(std::forward(data), targList{}); + invoke_lambda_with_args(std::forward(data), + targList{}); } }; - -template -RAJA_INLINE RAJA_HOST_DEVICE void invoke_lambda(Data &&data, camp::idx_seq const &, camp::idx_seq const &) +template +RAJA_INLINE RAJA_HOST_DEVICE void invoke_lambda(Data&& data, + camp::idx_seq const&, + camp::idx_seq const&) { using AllSegs = Segs; using AllParams = Params; // invoke the expanded Lambda executor, passing in all segments and params - StatementExecutor, Types>::exec(std::forward(data)); + StatementExecutor, + Types>::exec(std::forward(data)); } template -struct StatementExecutor, Types> { +struct StatementExecutor, Types> +{ template - static RAJA_INLINE RAJA_HOST_DEVICE void exec(Data &&data) + static RAJA_INLINE RAJA_HOST_DEVICE void exec(Data&& data) { using Data_t = camp::decay; @@ -335,14 +334,13 @@ struct StatementExecutor, Types> { std::forward(data), camp::make_idx_seq_t::value>{}, camp::make_idx_seq_t::value>{}); - } }; -} // namespace internal +} // namespace internal -} // end namespace RAJA +} // end namespace RAJA #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/Param.hpp b/include/RAJA/pattern/kernel/Param.hpp index 8e870ebe15..60972754e0 100644 --- a/include/RAJA/pattern/kernel/Param.hpp +++ b/include/RAJA/pattern/kernel/Param.hpp @@ -31,10 +31,10 @@ namespace RAJA namespace internal { -struct ParamBase { -}; +struct ParamBase +{}; -}// end namespace internal +} // end namespace internal namespace statement { @@ -47,20 +47,21 @@ namespace statement * RAJA::kernel execution policies. */ template -struct Param : public internal::ParamBase { +struct Param : public internal::ParamBase +{ constexpr static camp::idx_t param_idx = ParamId; template - RAJA_HOST_DEVICE RAJA_INLINE static auto eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static auto eval(Data const& data) -> decltype(camp::get(data.param_tuple)) { return camp::get(data.param_tuple); } }; -} // end namespace statement -} // end namespace RAJA +} // end namespace statement +} // end namespace RAJA #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/Reduce.hpp b/include/RAJA/pattern/kernel/Reduce.hpp index 4de4922ea3..ec1835e75d 100644 --- a/include/RAJA/pattern/kernel/Reduce.hpp +++ b/include/RAJA/pattern/kernel/Reduce.hpp @@ -39,10 +39,12 @@ namespace statement * */ template class ReduceOperator, + template + class ReduceOperator, typename ParamId, typename... 
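The Segs / Offsets / Params / ValuesT aliases reformatted above let a statement::Lambda name exactly which arguments it receives, instead of getting every segment value followed by every parameter. A sketch (names are illustrative):

#include "RAJA/RAJA.hpp"

void axpy_rows(double* y, const double* x, int N, int M, double a)
{
  using POL = RAJA::KernelPolicy<
      RAJA::statement::For<1, RAJA::seq_exec,
          RAJA::statement::For<0, RAJA::seq_exec,
              // Pass segment values 1 and 0, in that order, plus parameter 0.
              RAJA::statement::Lambda<0, RAJA::Segs<1, 0>, RAJA::Params<0>>>>>;

  RAJA::kernel_param<POL>(
      RAJA::make_tuple(RAJA::RangeSegment(0, M),   // segment 0: column j
                       RAJA::RangeSegment(0, N)),  // segment 1: row i
      RAJA::make_tuple(a),
      [=](int i, int j, double coef) { y[i * M + j] += coef * x[i * M + j]; });
}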
EnclosedStmts> -struct Reduce : public internal::Statement { +struct Reduce : public internal::Statement +{ static_assert(std::is_base_of::value, "Inappropriate ParamId, ParamId must be of type " @@ -52,10 +54,10 @@ struct Reduce : public internal::Statement { }; -} // end namespace statement +} // end namespace statement -} // end namespace RAJA +} // end namespace RAJA #endif /* RAJA_pattern_kernel_Reduce_HPP */ diff --git a/include/RAJA/pattern/kernel/Region.hpp b/include/RAJA/pattern/kernel/Region.hpp index 82b79ae775..7acb322494 100644 --- a/include/RAJA/pattern/kernel/Region.hpp +++ b/include/RAJA/pattern/kernel/Region.hpp @@ -30,37 +30,38 @@ namespace RAJA namespace statement { -template -struct Region : public internal::Statement { -}; +template +struct Region : public internal::Statement +{}; -} // end namespace statement +} // end namespace statement namespace internal { -//Statement executor to create a region within kernel - -//Note: RAJA region's lambda must capture by reference otherwise -//internal function calls are undefined. -template -struct StatementExecutor, Types> { +// Statement executor to create a region within kernel -template -static RAJA_INLINE void exec(Data &&data) +// Note: RAJA region's lambda must capture by reference otherwise +// internal function calls are undefined. +template +struct StatementExecutor, + Types> { - RAJA::region([&]() { + template + static RAJA_INLINE void exec(Data&& data) + { + + RAJA::region([&]() { using data_t = camp::decay; execute_statement_list, Types>(data_t(data)); }); -} - + } }; -} // namespace internal -} // end namespace RAJA +} // namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/Tile.hpp b/include/RAJA/pattern/kernel/Tile.hpp index 43f72e0545..86cfcb4345 100644 --- a/include/RAJA/pattern/kernel/Tile.hpp +++ b/include/RAJA/pattern/kernel/Tile.hpp @@ -34,14 +34,13 @@ namespace RAJA { -struct TileSize { +struct TileSize +{ const camp::idx_t size; RAJA_HOST_DEVICE RAJA_INLINE - constexpr TileSize(camp::idx_t size_) : size{size_} - { - } + constexpr TileSize(camp::idx_t size_) : size{size_} {} }; namespace statement @@ -56,26 +55,28 @@ template -struct Tile : public internal::Statement { +struct Tile : public internal::Statement +{ using tile_policy_t = TilePolicy; using exec_policy_t = ExecPolicy; }; -} // end namespace statement +} // end namespace statement ///! 
tag for a tiling loop template -struct tile_fixed { +struct tile_fixed +{ static constexpr camp::idx_t chunk_size = chunk_size_; }; template -struct tile_dynamic { +struct tile_dynamic +{ static constexpr camp::idx_t id = ArgumentId; }; - namespace internal { @@ -84,8 +85,12 @@ namespace internal * Assigns the tile segment to segment ArgumentId * */ -template -struct TileWrapper : public GenericWrapper { +template +struct TileWrapper : public GenericWrapper +{ using Base = GenericWrapper; using Base::Base; @@ -104,7 +109,8 @@ struct TileWrapper : public GenericWrapper { template -struct IterableTiler { +struct IterableTiler +{ using value_type = camp::decay; struct iterate @@ -122,16 +128,15 @@ struct IterableTiler { public: using value_type = iterate; using difference_type = camp::idx_t; - using pointer = value_type *; - using reference = value_type &; + using pointer = value_type*; + using reference = value_type&; using iterator_category = std::random_access_iterator_tag; RAJA_HOST_DEVICE RAJA_INLINE - constexpr iterator(IterableTiler const &itiler_, Index_type block_id_) + constexpr iterator(IterableTiler const& itiler_, Index_type block_id_) : itiler{itiler_}, block_id{block_id_} - { - } + {} RAJA_HOST_DEVICE RAJA_INLINE @@ -142,20 +147,20 @@ struct IterableTiler { } RAJA_HOST_DEVICE - RAJA_INLINE difference_type operator-(const iterator &rhs) const + RAJA_INLINE difference_type operator-(const iterator& rhs) const { return static_cast(block_id) - static_cast(rhs.block_id); } RAJA_HOST_DEVICE - RAJA_INLINE iterator operator-(const difference_type &rhs) const + RAJA_INLINE iterator operator-(const difference_type& rhs) const { return iterator(itiler, block_id - rhs); } RAJA_HOST_DEVICE - RAJA_INLINE iterator operator+(const difference_type &rhs) const + RAJA_INLINE iterator operator+(const difference_type& rhs) const { return iterator(itiler, block_id + rhs >= itiler.num_blocks ? 
itiler.num_blocks @@ -169,13 +174,13 @@ struct IterableTiler { } RAJA_HOST_DEVICE - RAJA_INLINE bool operator!=(const iterator &rhs) const + RAJA_INLINE bool operator!=(const iterator& rhs) const { return block_id != rhs.block_id; } RAJA_HOST_DEVICE - RAJA_INLINE bool operator<(const iterator &rhs) const + RAJA_INLINE bool operator<(const iterator& rhs) const { return block_id < rhs.block_id; } @@ -183,16 +188,17 @@ struct IterableTiler { RAJA_HOST_DEVICE RAJA_INLINE - IterableTiler(const Iterable &it_, camp::idx_t block_size_) + IterableTiler(const Iterable& it_, camp::idx_t block_size_) : it{it_}, block_size{block_size_} { using std::begin; using std::distance; using std::end; - dist = it.end() - it.begin(); // distance(begin(it), end(it)); + dist = it.end() - it.begin(); // distance(begin(it), end(it)); num_blocks = dist / block_size; // if (dist % block_size) num_blocks += 1; - if (dist - num_blocks * block_size > 0) { + if (dist - num_blocks * block_size > 0) + { num_blocks += 1; } } @@ -222,13 +228,15 @@ template struct StatementExecutor< - statement::Tile, EPol, EnclosedStmts...>, Types> { + statement::Tile, EPol, EnclosedStmts...>, + Types> +{ template - static RAJA_INLINE void exec(Data &data) + static RAJA_INLINE void exec(Data& data) { // Get the segment we are going to tile - auto const &segment = camp::get(data.segment_tuple); + auto const& segment = camp::get(data.segment_tuple); // Get the tiling policies chunk size auto chunk_size = tile_fixed::chunk_size; @@ -238,53 +246,63 @@ struct StatementExecutor< IterableTiler tiled_iterable(segment, chunk_size); // Wrap in case forall_impl needs to thread_privatize - TileWrapper tile_wrapper(data); + TileWrapper tile_wrapper(data); // Loop over tiles, executing enclosed statement list auto r = resources::get_resource::type::get_default(); - forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack()); + forall_impl(r, + EPol{}, + tiled_iterable, + tile_wrapper, + RAJA::expt::get_empty_forall_param_pack()); // Set range back to original values camp::get(data.segment_tuple) = tiled_iterable.it; } }; -template +template struct StatementExecutor< - statement::Tile, EPol, EnclosedStmts...>, Types> { + statement:: + Tile, EPol, EnclosedStmts...>, + Types> +{ template - static RAJA_INLINE void exec(Data &data) + static RAJA_INLINE void exec(Data& data) { // Get the segment we are going to tile - auto const &segment = camp::get(data.segment_tuple); + auto const& segment = camp::get(data.segment_tuple); // Get the tiling policies chunk size auto chunk_size = camp::get(data.param_tuple); - static_assert(camp::concepts::metalib::is_same::value, - "Extracted parameter must be of type TileSize."); + static_assert( + camp::concepts::metalib::is_same::value, + "Extracted parameter must be of type TileSize."); // Create a tile iterator IterableTiler tiled_iterable(segment, chunk_size.size); // Wrap in case forall_impl needs to thread_privatize - TileWrapper tile_wrapper(data); + TileWrapper tile_wrapper(data); // Loop over tiles, executing enclosed statement list auto r = resources::get_resource::type::get_default(); - forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack()); - + forall_impl(r, + EPol{}, + tiled_iterable, + tile_wrapper, + RAJA::expt::get_empty_forall_param_pack()); + // Set range back to original values camp::get(data.segment_tuple) = tiled_iterable.it; } }; -} // end namespace internal -} // end namespace RAJA +} // end namespace internal +} // end 
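The Tile statement and IterableTiler above split a segment into chunk_size pieces and run the enclosed statements once per tile, restoring the original segment afterwards; tile_fixed fixes the chunk size at compile time, while tile_dynamic reads a RAJA::TileSize entry from the parameter tuple at run time. A fixed-size sketch:

#include "RAJA/RAJA.hpp"

void tiled_fill(int* v, int N)
{
  using POL = RAJA::KernelPolicy<
      RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::seq_exec,  // loop over tiles
          RAJA::statement::For<0, RAJA::seq_exec,                     // loop within a tile
              RAJA::statement::Lambda<0>>>>;

  RAJA::kernel<POL>(RAJA::make_tuple(RAJA::RangeSegment(0, N)),
                    [=](int i) { v[i] = i; });
  // With tile_dynamic, the chunk size would instead be supplied as a
  // RAJA::TileSize value in the kernel_param parameter tuple.
}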
namespace RAJA #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/TileTCount.hpp b/include/RAJA/pattern/kernel/TileTCount.hpp index 2653e992c7..eadf8dc2d2 100644 --- a/include/RAJA/pattern/kernel/TileTCount.hpp +++ b/include/RAJA/pattern/kernel/TileTCount.hpp @@ -47,7 +47,8 @@ template -struct TileTCount : public internal::Statement { +struct TileTCount : public internal::Statement +{ static_assert(std::is_base_of::value, "Inappropriate ParamId, ParamId must be of type " "RAJA::Statement::Param< # >"); @@ -56,7 +57,7 @@ struct TileTCount : public internal::Statement { }; -} // end namespace statement +} // end namespace statement namespace internal { @@ -66,9 +67,13 @@ namespace internal * Assigns the tile segment to segment ArgumentId * Assigns the tile index to param ParamId */ -template -struct TileTCountWrapper : public GenericWrapper { +struct TileTCountWrapper : public GenericWrapper +{ using Base = GenericWrapper; using Base::Base; @@ -79,17 +84,16 @@ struct TileTCountWrapper : public GenericWrapper { // Assign the tile's segment to the tuple camp::get(Base::data.segment_tuple) = si.s; - + // Assign the tile's index Base::data.template assign_param(si.i); - + // Execute enclosed statements Base::exec(); } }; - /*! * A generic RAJA::kernel forall_impl executor for statement::TileTCount * @@ -102,14 +106,16 @@ template struct StatementExecutor< - statement::TileTCount, Types> { + statement::TileTCount, + Types> +{ template - static RAJA_INLINE void exec(Data &data) + static RAJA_INLINE void exec(Data& data) { // Get the segment we are going to tile - auto const &segment = camp::get(data.segment_tuple); + auto const& segment = camp::get(data.segment_tuple); // Get the tiling policies chunk size auto chunk_size = TPol::chunk_size; @@ -119,12 +125,16 @@ struct StatementExecutor< IterableTiler tiled_iterable(segment, chunk_size); // Wrap in case forall_impl needs to thread_privatize - TileTCountWrapper tile_wrapper(data); + TileTCountWrapper + tile_wrapper(data); // Loop over tiles, executing enclosed statement list auto r = resources::get_resource::type::get_default(); - forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack()); + forall_impl(r, + EPol{}, + tiled_iterable, + tile_wrapper, + RAJA::expt::get_empty_forall_param_pack()); // Set range back to original values camp::get(data.segment_tuple) = tiled_iterable.it; @@ -132,7 +142,7 @@ struct StatementExecutor< }; -} // end namespace internal -} // end namespace RAJA +} // end namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/internal/LoopData.hpp b/include/RAJA/pattern/kernel/internal/LoopData.hpp index 9667a55538..3109b9b452 100644 --- a/include/RAJA/pattern/kernel/internal/LoopData.hpp +++ b/include/RAJA/pattern/kernel/internal/LoopData.hpp @@ -40,29 +40,27 @@ namespace internal { - - - // Universal base of all For wrappers for type traits - struct ForList { - }; - struct ForBase { - }; - struct CollapseBase { - }; - template - struct ForTraitBase : public ForBase { - constexpr static camp::idx_t index_val = ArgumentId; - using index = camp::num; - using index_type = camp::nil; // default to invalid type - using policy_type = Policy; - using type = ForTraitBase; // make camp::value compatible - }; - - +// Universal base of all For wrappers for type traits +struct ForList +{}; +struct ForBase +{}; +struct CollapseBase +{}; +template +struct ForTraitBase : public ForBase +{ + constexpr static 
camp::idx_t index_val = ArgumentId; + using index = camp::num; + using index_type = camp::nil; // default to invalid type + using policy_type = Policy; + using type = ForTraitBase; // make camp::value compatible +}; template -struct iterable_difftype_getter { +struct iterable_difftype_getter +{ using type = typename std::iterator_traits< typename Iterator::iterator>::difference_type; }; @@ -79,7 +77,8 @@ using difftype_tuple_from_segments = template -struct iterable_value_type_getter { +struct iterable_value_type_getter +{ using type = typename std::iterator_traits::value_type; }; @@ -100,13 +99,12 @@ using index_types_from_segments = value_type_list_from_segments>::type; - - template -struct LoopData { +struct LoopData +{ using Self = LoopData; @@ -138,78 +136,70 @@ struct LoopData { using vector_sizes_t = tuple_of_n::value>; vector_sizes_t vector_sizes; - RAJA_INLINE RAJA_HOST_DEVICE constexpr - LoopData(SegmentTuple const &s, ParamTuple const &p, Resource r, Bodies const &... b) + RAJA_INLINE RAJA_HOST_DEVICE constexpr LoopData(SegmentTuple const& s, + ParamTuple const& p, + Resource r, + Bodies const&... b) : segment_tuple(s), param_tuple(p), res(r), bodies(b...) - { - } - constexpr LoopData(LoopData const &) = default; - constexpr LoopData(LoopData &&) = default; + {} + constexpr LoopData(LoopData const&) = default; + constexpr LoopData(LoopData&&) = default; template - RAJA_HOST_DEVICE RAJA_INLINE void assign_offset(IndexT const &i) + RAJA_HOST_DEVICE RAJA_INLINE void assign_offset(IndexT const& i) { camp::get(offset_tuple) = i; } template - RAJA_HOST_DEVICE RAJA_INLINE void assign_param(IndexT const &i) + RAJA_HOST_DEVICE RAJA_INLINE void assign_param(IndexT const& i) { - using param_t = camp::at_v; + using param_t = + camp::at_v; camp::get(param_tuple) = param_t(i); } template - RAJA_HOST_DEVICE RAJA_INLINE - auto get_param() -> - camp::at_v + RAJA_HOST_DEVICE RAJA_INLINE auto get_param() + -> camp::at_v { return camp::get(param_tuple); } - RAJA_HOST_DEVICE RAJA_INLINE - Resource get_resource() - { - return res; - } - - + RAJA_HOST_DEVICE RAJA_INLINE Resource get_resource() { return res; } }; - - template -using segment_diff_type = - typename std::iterator_traits< - typename camp::at_v::iterator>::difference_type; - - +using segment_diff_type = typename std::iterator_traits< + typename camp::at_v::iterator>::difference_type; template -RAJA_INLINE RAJA_HOST_DEVICE auto segment_length(Data const &data) -> - segment_diff_type +RAJA_INLINE RAJA_HOST_DEVICE auto segment_length(Data const& data) + -> segment_diff_type { return camp::get(data.segment_tuple).end() - camp::get(data.segment_tuple).begin(); } - - template -struct GenericWrapper : GenericWrapperBase { +struct GenericWrapper : GenericWrapperBase +{ using data_t = camp::decay; - data_t &data; + data_t& data; RAJA_INLINE - constexpr explicit GenericWrapper(data_t &d) : data{d} {} + constexpr explicit GenericWrapper(data_t& d) : data{d} {} RAJA_INLINE - void exec() { execute_statement_list, Types>(data); } + void exec() + { + execute_statement_list, Types>(data); + } }; @@ -217,28 +207,27 @@ struct GenericWrapper : GenericWrapperBase { * Convenience object used to create a thread-private LoopData object. 
*/ template -struct NestedPrivatizer { +struct NestedPrivatizer +{ using data_t = typename T::data_t; using value_type = camp::decay; - using reference_type = value_type &; + using reference_type = value_type&; data_t privatized_data; value_type privatized_wrapper; RAJA_INLINE - constexpr NestedPrivatizer(const T &o) + constexpr NestedPrivatizer(const T& o) : privatized_data{o.data}, privatized_wrapper(privatized_data) - { - } + {} RAJA_INLINE reference_type get_priv() { return privatized_wrapper; } }; - -} // end namespace internal -} // end namespace RAJA +} // end namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_internal_LoopData_HPP */ diff --git a/include/RAJA/pattern/kernel/internal/LoopTypes.hpp b/include/RAJA/pattern/kernel/internal/LoopTypes.hpp index 7f77df4214..3bfb7b5e9f 100644 --- a/include/RAJA/pattern/kernel/internal/LoopTypes.hpp +++ b/include/RAJA/pattern/kernel/internal/LoopTypes.hpp @@ -29,67 +29,77 @@ namespace internal { -template +template struct LoopTypes; -template -struct LoopTypes, camp::list> { +template +struct LoopTypes, camp::list> +{ - using Self = LoopTypes, camp::list>; + using Self = + LoopTypes, camp::list>; static constexpr size_t s_num_segments = sizeof...(SegmentTypes); // This ensures that you don't double-loop over a segment within the same // loop nesting static_assert(s_num_segments == sizeof...(OffsetTypes), - "Number of segments and offsets must match"); + "Number of segments " + "and offsets must " + "match"); using segment_types_t = camp::list; using offset_types_t = camp::list; }; -template -using makeInitialLoopTypes = - LoopTypes::value>, - list_of_n::value>>; +template +using makeInitialLoopTypes = LoopTypes< + list_of_n::value>, + list_of_n::value>>; -template +template struct SetSegmentTypeHelper; -template +template struct SetSegmentTypeHelper> { - using segment_list = typename Types::segment_types_t; - using offset_list = typename Types::offset_types_t; - - static_assert(std::is_same, void>::value, - "Segment was already assigned: Probably looping over same segment in loop nest"); - - using type = LoopTypes< - camp::list>::type...>, - camp::list>::type...>>; - + using segment_list = typename Types::segment_types_t; + using offset_list = typename Types::offset_types_t; + + static_assert(std::is_same, void>::value, + "Segment was already assigned: Probably looping over same " + "segment in loop nest"); + + using type = LoopTypes< + camp::list< + typename std::conditional>::type...>, + camp::list< + typename std::conditional>::type...>>; }; -template -using setSegmentType = - typename SetSegmentTypeHelper>::type; +template +using setSegmentType = typename SetSegmentTypeHelper< + Types, + Segment, + T, + camp::make_idx_seq_t>::type; -template -using setSegmentTypeFromData = - setSegmentType::index_types_t, Segment>>; +template +using setSegmentTypeFromData = setSegmentType< + Types, + Segment, + camp::at_v::index_types_t, Segment>>; -} // end namespace internal -} // end namespace RAJA +} // end namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_internal_LoopTypes_HPP */ diff --git a/include/RAJA/pattern/kernel/internal/Statement.hpp b/include/RAJA/pattern/kernel/internal/Statement.hpp index 48ca828a68..8279aac29c 100644 --- a/include/RAJA/pattern/kernel/internal/Statement.hpp +++ b/include/RAJA/pattern/kernel/internal/Statement.hpp @@ -28,11 +28,13 @@ namespace internal { - template -struct Statement { - static_assert(std::is_same::value || sizeof...(EnclosedStmts) > 0, - "Executable 
statement with no enclosed statements, this is almost certainly a bug"); +struct Statement +{ + static_assert(std::is_same::value || + sizeof...(EnclosedStmts) > 0, + "Executable statement with no enclosed statements, this is " + "almost certainly a bug"); Statement() = delete; using enclosed_statements_t = StatementList; @@ -40,15 +42,12 @@ struct Statement { }; - - template struct StatementExecutor; - -} // end namespace internal -} // end namespace RAJA +} // end namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_internal_HPP */ diff --git a/include/RAJA/pattern/kernel/internal/StatementList.hpp b/include/RAJA/pattern/kernel/internal/StatementList.hpp index 5c0d71afb4..ac88ffe3cf 100644 --- a/include/RAJA/pattern/kernel/internal/StatementList.hpp +++ b/include/RAJA/pattern/kernel/internal/StatementList.hpp @@ -35,8 +35,6 @@ template struct StatementExecutor; - - template using StatementList = camp::list; @@ -47,11 +45,13 @@ struct StatementListExecutor; template -struct StatementListExecutor { + typename StmtList, + typename Types> +struct StatementListExecutor +{ template - static RAJA_INLINE void exec(Data &&data) + static RAJA_INLINE void exec(Data&& data) { // Get the statement we're going to execute @@ -61,8 +61,10 @@ struct StatementListExecutor { StatementExecutor::exec(std::forward(data)); // call our next statement - StatementListExecutor::exec( - std::forward(data)); + StatementListExecutor::exec(std::forward(data)); } }; @@ -72,26 +74,25 @@ struct StatementListExecutor { */ template -struct StatementListExecutor { +struct StatementListExecutor +{ template - static RAJA_INLINE void exec(Data &&) - { - } + static RAJA_INLINE void exec(Data&&) + {} }; template -RAJA_INLINE void execute_statement_list(Data &&data) +RAJA_INLINE void execute_statement_list(Data&& data) { StatementListExecutor<0, camp::size::value, StmtList, Types>::exec( std::forward(data)); } - -} // end namespace internal -} // end namespace RAJA +} // end namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_internal_HPP */ diff --git a/include/RAJA/pattern/kernel/internal/Template.hpp b/include/RAJA/pattern/kernel/internal/Template.hpp index c750b95986..c8a980bf97 100644 --- a/include/RAJA/pattern/kernel/internal/Template.hpp +++ b/include/RAJA/pattern/kernel/internal/Template.hpp @@ -39,8 +39,8 @@ struct SeqToType template struct ListOfNHelper; -template -struct ListOfNHelper > +template +struct ListOfNHelper> { using type = camp::list::type...>; }; @@ -49,8 +49,8 @@ struct ListOfNHelper > template struct TupleOfNHelper; -template -struct TupleOfNHelper > +template +struct TupleOfNHelper> { using type = camp::tuple::type...>; }; @@ -64,7 +64,8 @@ struct TupleOfNHelper > * */ template -using list_of_n = typename detail::ListOfNHelper>::type; +using list_of_n = + typename detail::ListOfNHelper>::type; /* @@ -74,12 +75,12 @@ using list_of_n = typename detail::ListOfNHelper>::ty * */ template -using tuple_of_n = typename detail::TupleOfNHelper>::type; +using tuple_of_n = + typename detail::TupleOfNHelper>::type; - -} // end namespace internal -} // end namespace RAJA +} // end namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_internal_Template_HPP */ diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index b78ec0de92..7c14e08236 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -28,7 +28,7 @@ #include "camp/concepts.hpp" #include 
"camp/tuple.hpp" -//Odd dependecy with atomics is breaking CI builds +// Odd dependecy with atomics is breaking CI builds //#include "RAJA/util/View.hpp" #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) && !defined(RAJA_ENABLE_SYCL) @@ -41,12 +41,17 @@ namespace RAJA { // GPU or CPU threads available -//strongly type the ExecPlace (guards agaist errors) -enum struct ExecPlace : int { HOST, DEVICE, NUM_PLACES }; - -struct null_launch_t { +// strongly type the ExecPlace (guards agaist errors) +enum struct ExecPlace : int +{ + HOST, + DEVICE, + NUM_PLACES }; +struct null_launch_t +{}; + // Support for host, and device template -struct LoopPolicy { +struct LoopPolicy +{ using host_policy_t = HOST_POLICY; #if defined(RAJA_GPU_ACTIVE) using device_policy_t = DEVICE_POLICY; @@ -68,7 +74,8 @@ template -struct LaunchPolicy { +struct LaunchPolicy +{ using host_policy_t = HOST_POLICY; #if defined(RAJA_GPU_ACTIVE) using device_policy_t = DEVICE_POLICY; @@ -76,7 +83,8 @@ struct LaunchPolicy { }; -struct Teams { +struct Teams +{ int value[3]; RAJA_INLINE @@ -96,7 +104,8 @@ struct Teams { constexpr Teams(int i, int j, int k) : value{i, j, k} {} }; -struct Threads { +struct Threads +{ int value[3]; RAJA_INLINE @@ -117,7 +126,8 @@ struct Threads { constexpr Threads(int i, int j, int k) : value{i, j, k} {} }; -struct Lanes { +struct Lanes +{ int value; RAJA_INLINE @@ -129,7 +139,8 @@ struct Lanes { constexpr Lanes(int i) : value(i) {} }; -struct LaunchParams { +struct LaunchParams +{ public: Teams teams; Threads threads; @@ -138,67 +149,71 @@ struct LaunchParams { RAJA_INLINE LaunchParams() = default; - LaunchParams(Teams in_teams, Threads in_threads, size_t in_shared_mem_size = 0) - : teams(in_teams), threads(in_threads), shared_mem_size(in_shared_mem_size) {}; + LaunchParams(Teams in_teams, + Threads in_threads, + size_t in_shared_mem_size = 0) + : teams(in_teams), + threads(in_threads), + shared_mem_size(in_shared_mem_size){}; private: RAJA_HOST_DEVICE RAJA_INLINE - Teams apply(Teams const &a) { return (teams = a); } + Teams apply(Teams const& a) { return (teams = a); } RAJA_HOST_DEVICE RAJA_INLINE - Threads apply(Threads const &a) { return (threads = a); } + Threads apply(Threads const& a) { return (threads = a); } }; class LaunchContext { public: - - //Bump style allocator used to - //get memory from the pool + // Bump style allocator used to + // get memory from the pool size_t shared_mem_offset; - void *shared_mem_ptr; + void* shared_mem_ptr; #if defined(RAJA_ENABLE_SYCL) - mutable cl::sycl::nd_item<3> *itm; + mutable cl::sycl::nd_item<3>* itm; #endif RAJA_HOST_DEVICE LaunchContext() - : shared_mem_offset(0), shared_mem_ptr(nullptr) - { - } + : shared_mem_offset(0), shared_mem_ptr(nullptr) + {} - //TODO handle alignment - template + // TODO handle alignment + template RAJA_HOST_DEVICE T* getSharedMemory(size_t bytes) { - //Calculate offset in bytes with a char pointer - void* mem_ptr = static_cast(shared_mem_ptr) + shared_mem_offset; + // Calculate offset in bytes with a char pointer + void* mem_ptr = static_cast(shared_mem_ptr) + shared_mem_offset; - shared_mem_offset += bytes*sizeof(T); + shared_mem_offset += bytes * sizeof(T); - //convert to desired type + // convert to desired type return static_cast(mem_ptr); } /* //Odd dependecy with atomics is breaking CI builds - template - RAJA_HOST_DEVICE auto getSharedMemoryView(size_t bytes, arg idx, args... idxs) + template RAJA_HOST_DEVICE auto + getSharedMemoryView(size_t bytes, arg idx, args... 
idxs) { T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset]; shared_mem_offset += bytes*sizeof(T); - return RAJA::View>(mem_ptr, idx, idxs...); + return RAJA::View>(mem_ptr, idx, + idxs...); } */ RAJA_HOST_DEVICE void releaseSharedMemory() { - //On the cpu/gpu we want to restart the count + // On the cpu/gpu we want to restart the count shared_mem_offset = 0; } @@ -218,19 +233,24 @@ class LaunchContext template struct LaunchExecute; -//Policy based launch with support to new reducers... -template -void launch(LaunchParams const &launch_params, const char *kernel_name, ReduceParams&&... rest_of_launch_args) +// Policy based launch with support to new reducers... +template +void launch(LaunchParams const& launch_params, + const char* kernel_name, + ReduceParams&&... rest_of_launch_args) { - //Get reducers - auto reducers = expt::make_forall_param_pack(std::forward(rest_of_launch_args)...); + // Get reducers + auto reducers = expt::make_forall_param_pack( + std::forward(rest_of_launch_args)...); - auto&& launch_body = expt::get_lambda(std::forward(rest_of_launch_args)...); + auto&& launch_body = + expt::get_lambda(std::forward(rest_of_launch_args)...); - //Take the first policy as we assume the second policy is not user defined. - //We rely on the user to pair launch and loop policies correctly. - util::PluginContext context{util::make_context()}; + // Take the first policy as we assume the second policy is not user defined. + // We rely on the user to pair launch and loop policies correctly. + util::PluginContext context{ + util::make_context()}; util::callPreCapturePlugins(context); using RAJA::util::trigger_updates_before; @@ -242,29 +262,36 @@ void launch(LaunchParams const &launch_params, const char *kernel_name, ReducePa using launch_t = LaunchExecute; - using Res = typename resources::get_resource::type; + using Res = typename resources::get_resource< + typename LAUNCH_POLICY::host_policy_t>::type; - launch_t::exec(Res::get_default(), launch_params, kernel_name, p_body, reducers); + launch_t::exec( + Res::get_default(), launch_params, kernel_name, p_body, reducers); util::callPostLaunchPlugins(context); } -//Duplicate of code above on account that we need to support the case in which a kernel_name is not given -template -void launch(LaunchParams const &launch_params, ReduceParams&&... rest_of_launch_args) +// Duplicate of code above on account that we need to support the case in which +// a kernel_name is not given +template +void launch(LaunchParams const& launch_params, + ReduceParams&&... rest_of_launch_args) { - const char *kernel_name = nullptr; + const char* kernel_name = nullptr; - //Get reducers - auto reducers = expt::make_forall_param_pack(std::forward(rest_of_launch_args)...); + // Get reducers + auto reducers = expt::make_forall_param_pack( + std::forward(rest_of_launch_args)...); - auto&& launch_body = expt::get_lambda(std::forward(rest_of_launch_args)...); + auto&& launch_body = + expt::get_lambda(std::forward(rest_of_launch_args)...); - //Take the first policy as we assume the second policy is not user defined. - //We rely on the user to pair launch and loop policies correctly. - util::PluginContext context{util::make_context()}; + // Take the first policy as we assume the second policy is not user defined. + // We rely on the user to pair launch and loop policies correctly. 
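getSharedMemory in the LaunchContext above is a bump allocator over the pool sized by the third LaunchParams argument; note that the requested count is multiplied by sizeof(T), so callers effectively pass a number of elements. A host-only sketch, assuming the active launch backend allocates the pool named in LaunchParams:

#include "RAJA/RAJA.hpp"

void scratch_demo(int N)
{
  using launch_pol = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
  using loop_pol   = RAJA::LoopPolicy<RAJA::seq_exec>;

  RAJA::launch<launch_pol>(
      RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1),
                         2 * N * sizeof(double)),      // shared memory pool size in bytes
      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
        double* a = ctx.getSharedMemory<double>(N);    // first N doubles from the pool
        double* b = ctx.getSharedMemory<double>(N);    // next N doubles from the pool

        RAJA::loop<loop_pol>(ctx, RAJA::RangeSegment(0, N), [&](int i) {
          a[i] = i;
          b[i] = 2.0 * a[i];
        });

        ctx.releaseSharedMemory();                     // reset the bump offset to zero
      });
}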
+ util::PluginContext context{ + util::make_context()}; util::callPreCapturePlugins(context); using RAJA::util::trigger_updates_before; @@ -276,148 +303,208 @@ void launch(LaunchParams const &launch_params, ReduceParams&&... rest_of_launch_ using launch_t = LaunchExecute; - using Res = typename resources::get_resource::type; + using Res = typename resources::get_resource< + typename LAUNCH_POLICY::host_policy_t>::type; - launch_t::exec(Res::get_default(), launch_params, kernel_name, p_body, reducers); + launch_t::exec( + Res::get_default(), launch_params, kernel_name, p_body, reducers); util::callPostLaunchPlugins(context); } //================================================= -//Run time based policy launch +// Run time based policy launch //================================================= template -void launch(ExecPlace place, LaunchParams const ¶ms, BODY const &body) +void launch(ExecPlace place, LaunchParams const& params, BODY const& body) { launch(place, params, nullptr, body); } template -void launch(ExecPlace place, const LaunchParams ¶ms, const char *kernel_name, BODY const &body) +void launch(ExecPlace place, + const LaunchParams& params, + const char* kernel_name, + BODY const& body) { - //Forward to single policy launch API - simplifies testing of plugins - switch (place) { - case ExecPlace::HOST: { - using Res = typename resources::get_resource::type; - launch>(Res::get_default(), params, kernel_name, body); - break; - } + // Forward to single policy launch API - simplifies testing of plugins + switch (place) + { + case ExecPlace::HOST: + { + using Res = typename resources::get_resource< + typename POLICY_LIST::host_policy_t>::type; + launch>( + Res::get_default(), params, kernel_name, body); + break; + } #if defined(RAJA_GPU_ACTIVE) - case ExecPlace::DEVICE: { - using Res = typename resources::get_resource::type; - launch>(Res::get_default(), params, kernel_name, body); - break; - } + case ExecPlace::DEVICE: + { + using Res = typename resources::get_resource< + typename POLICY_LIST::device_policy_t>::type; + launch>( + Res::get_default(), params, kernel_name, body); + break; + } #endif - default: - RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); + default: + RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); } - } -//Run-time API for new reducer interface +// Run-time API for new reducer interface template -void launch(ExecPlace place, const LaunchParams &launch_params, const char *kernel_name, ReduceParams&&... rest_of_launch_args) +void launch(ExecPlace place, + const LaunchParams& launch_params, + const char* kernel_name, + ReduceParams&&... 
rest_of_launch_args) { - //Forward to single policy launch API - simplifies testing of plugins - switch (place) { - case ExecPlace::HOST: { - using Res = typename resources::get_resource::type; - launch> - (Res::get_default(), launch_params, kernel_name, std::forward(rest_of_launch_args)...); - break; - } + // Forward to single policy launch API - simplifies testing of plugins + switch (place) + { + case ExecPlace::HOST: + { + using Res = typename resources::get_resource< + typename POLICY_LIST::host_policy_t>::type; + launch>( + Res::get_default(), + launch_params, + kernel_name, + std::forward(rest_of_launch_args)...); + break; + } #if defined(RAJA_GPU_ACTIVE) - case ExecPlace::DEVICE: { - using Res = typename resources::get_resource::type; - launch> - (Res::get_default(), launch_params, kernel_name, std::forward(rest_of_launch_args)...); - break; - } + case ExecPlace::DEVICE: + { + using Res = typename resources::get_resource< + typename POLICY_LIST::device_policy_t>::type; + launch>( + Res::get_default(), + launch_params, + kernel_name, + std::forward(rest_of_launch_args)...); + break; + } #endif - default: - RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); + default: + RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); } - } -//Run-time API for new reducer interface with support of the case without a new kernel name +// Run-time API for new reducer interface with support of the case without a new +// kernel name template -void launch(ExecPlace place, const LaunchParams &launch_params, ReduceParams&&... rest_of_launch_args) - //BODY const &body) +void launch(ExecPlace place, + const LaunchParams& launch_params, + ReduceParams&&... rest_of_launch_args) +// BODY const &body) { - const char *kernel_name = nullptr; + const char* kernel_name = nullptr; - //Forward to single policy launch API - simplifies testing of plugins - switch (place) { - case ExecPlace::HOST: { - using Res = typename resources::get_resource::type; - launch> - (Res::get_default(), launch_params, kernel_name, std::forward(rest_of_launch_args)...); - break; - } + // Forward to single policy launch API - simplifies testing of plugins + switch (place) + { + case ExecPlace::HOST: + { + using Res = typename resources::get_resource< + typename POLICY_LIST::host_policy_t>::type; + launch>( + Res::get_default(), + launch_params, + kernel_name, + std::forward(rest_of_launch_args)...); + break; + } #if defined(RAJA_GPU_ACTIVE) - case ExecPlace::DEVICE: { - using Res = typename resources::get_resource::type; - launch> - (Res::get_default(), launch_params, kernel_name, std::forward(rest_of_launch_args)...); - break; - } + case ExecPlace::DEVICE: + { + using Res = typename resources::get_resource< + typename POLICY_LIST::device_policy_t>::type; + launch>( + Res::get_default(), + launch_params, + kernel_name, + std::forward(rest_of_launch_args)...); + break; + } #endif - default: - RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); + default: + RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); } - } -// Helper function to retrieve a resource based on the run-time policy - if a device is active -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) -template -RAJA::resources::Resource Get_Runtime_Resource(T host_res, U device_res, RAJA::ExecPlace device){ - if(device == RAJA::ExecPlace::DEVICE) {return RAJA::resources::Resource(device_res);} - else { return RAJA::resources::Resource(host_res); } +// Helper function to 
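The ExecPlace overloads above dispatch a single POLICY_LIST to its host or device half at run time. A sketch with a host-only list (a GPU-enabled build would add a second, device policy to each alias, which ExecPlace::DEVICE would then select; otherwise DEVICE falls through to the abort shown above):

#include "RAJA/RAJA.hpp"

void run(double* x, int N, RAJA::ExecPlace place)
{
  using launch_pol = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
  using loop_pol   = RAJA::LoopPolicy<RAJA::seq_exec>;

  RAJA::launch<launch_pol>(place,
      RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1)),
      "run_kernel",                                  // optional kernel-name overload
      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
        RAJA::loop<loop_pol>(ctx, RAJA::RangeSegment(0, N),
                             [&](int i) { x[i] += 1.0; });
      });
}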
retrieve a resource based on the run-time policy - if a +// device is active +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || \ + defined(RAJA_ENABLE_SYCL) +template +RAJA::resources::Resource +Get_Runtime_Resource(T host_res, U device_res, RAJA::ExecPlace device) +{ + if (device == RAJA::ExecPlace::DEVICE) + { + return RAJA::resources::Resource(device_res); + } + else + { + return RAJA::resources::Resource(host_res); + } } #endif -template -RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device){ - if(device == RAJA::ExecPlace::DEVICE) {RAJA_ABORT_OR_THROW("Device is not enabled");} +template +RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device) +{ + if (device == RAJA::ExecPlace::DEVICE) + { + RAJA_ABORT_OR_THROW("Device is not enabled"); + } return RAJA::resources::Resource(host_res); } -//Launch API which takes team resource struct and supports new reducers -template +// Launch API which takes team resource struct and supports new reducers +template resources::EventProxy -launch(RAJA::resources::Resource res, LaunchParams const &launch_params, - const char *kernel_name, ReduceParams&&... rest_of_launch_args) +launch(RAJA::resources::Resource res, + LaunchParams const& launch_params, + const char* kernel_name, + ReduceParams&&... rest_of_launch_args) { - //Get reducers - auto reducers = expt::make_forall_param_pack(std::forward(rest_of_launch_args)...); + // Get reducers + auto reducers = expt::make_forall_param_pack( + std::forward(rest_of_launch_args)...); - auto&& launch_body = expt::get_lambda(std::forward(rest_of_launch_args)...); + auto&& launch_body = + expt::get_lambda(std::forward(rest_of_launch_args)...); ExecPlace place; - if(res.get_platform() == RAJA::Platform::host) { + if (res.get_platform() == RAJA::Platform::host) + { place = RAJA::ExecPlace::HOST; - } else { + } + else + { place = RAJA::ExecPlace::DEVICE; } // - //Configure plugins + // Configure plugins // #if defined(RAJA_GPU_ACTIVE) - util::PluginContext context{place == ExecPlace::HOST ? - util::make_context() : - util::make_context()}; + util::PluginContext context{ + place == ExecPlace::HOST + ? 
util::make_context() + : util::make_context()}; #else - util::PluginContext context{util::make_context()}; + util::PluginContext context{ + util::make_context()}; #endif util::callPreCapturePlugins(context); @@ -429,24 +516,30 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params, util::callPreLaunchPlugins(context); - switch (place) { - case ExecPlace::HOST: { - using launch_t = LaunchExecute; - resources::EventProxy e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers); - util::callPostLaunchPlugins(context); - return e_proxy; - } + switch (place) + { + case ExecPlace::HOST: + { + using launch_t = LaunchExecute; + resources::EventProxy e_proxy = + launch_t::exec(res, launch_params, kernel_name, p_body, reducers); + util::callPostLaunchPlugins(context); + return e_proxy; + } #if defined(RAJA_GPU_ACTIVE) - case ExecPlace::DEVICE: { - using launch_t = LaunchExecute; - resources::EventProxy e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers); - util::callPostLaunchPlugins(context); - return e_proxy; - } + case ExecPlace::DEVICE: + { + using launch_t = LaunchExecute; + resources::EventProxy e_proxy = + launch_t::exec(res, launch_params, kernel_name, p_body, reducers); + util::callPostLaunchPlugins(context); + return e_proxy; + } #endif - default: { - RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); - } + default: + { + RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); + } } RAJA_ABORT_OR_THROW("Unknown launch place"); @@ -456,36 +549,45 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params, } -//Duplicate of API above on account that we need to handle the case that a kernel name is not provided -template +// Duplicate of API above on account that we need to handle the case that a +// kernel name is not provided +template resources::EventProxy -launch(RAJA::resources::Resource res, LaunchParams const &launch_params, +launch(RAJA::resources::Resource res, + LaunchParams const& launch_params, ReduceParams&&... rest_of_launch_args) { - const char *kernel_name = nullptr; + const char* kernel_name = nullptr; - //Get reducers - auto reducers = expt::make_forall_param_pack(std::forward(rest_of_launch_args)...); + // Get reducers + auto reducers = expt::make_forall_param_pack( + std::forward(rest_of_launch_args)...); - auto&& launch_body = expt::get_lambda(std::forward(rest_of_launch_args)...); + auto&& launch_body = + expt::get_lambda(std::forward(rest_of_launch_args)...); ExecPlace place; - if(res.get_platform() == RAJA::Platform::host) { + if (res.get_platform() == RAJA::Platform::host) + { place = RAJA::ExecPlace::HOST; - } else { + } + else + { place = RAJA::ExecPlace::DEVICE; } // - //Configure plugins + // Configure plugins // #if defined(RAJA_GPU_ACTIVE) - util::PluginContext context{place == ExecPlace::HOST ? - util::make_context() : - util::make_context()}; + util::PluginContext context{ + place == ExecPlace::HOST + ? 
util::make_context() + : util::make_context()}; #else - util::PluginContext context{util::make_context()}; + util::PluginContext context{ + util::make_context()}; #endif util::callPreCapturePlugins(context); @@ -497,24 +599,30 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params, util::callPreLaunchPlugins(context); - switch (place) { - case ExecPlace::HOST: { - using launch_t = LaunchExecute; - resources::EventProxy e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers); - util::callPostLaunchPlugins(context); - return e_proxy; - } + switch (place) + { + case ExecPlace::HOST: + { + using launch_t = LaunchExecute; + resources::EventProxy e_proxy = + launch_t::exec(res, launch_params, kernel_name, p_body, reducers); + util::callPostLaunchPlugins(context); + return e_proxy; + } #if defined(RAJA_GPU_ACTIVE) - case ExecPlace::DEVICE: { - using launch_t = LaunchExecute; - resources::EventProxy e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers); - util::callPostLaunchPlugins(context); - return e_proxy; - } + case ExecPlace::DEVICE: + { + using launch_t = LaunchExecute; + resources::EventProxy e_proxy = + launch_t::exec(res, launch_params, kernel_name, p_body, reducers); + util::callPostLaunchPlugins(context); + return e_proxy; + } #endif - default: { - RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); - } + default: + { + RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); + } } RAJA_ABORT_OR_THROW("Unknown launch place"); @@ -523,7 +631,7 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params, return resources::EventProxy(res); } -template +template #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) using loop_policy = typename POLICY_LIST::device_policy_t; #else @@ -541,28 +649,23 @@ template -RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx, - SEGMENT const &segment, - BODY const &body) +RAJA_HOST_DEVICE RAJA_INLINE void +loop(CONTEXT const& ctx, SEGMENT const& segment, BODY const& body) { - LoopExecute, SEGMENT>::exec(ctx, - segment, - body); + LoopExecute, SEGMENT>::exec(ctx, segment, body); } template -RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx, - SEGMENT const &segment, - BODY const &body) +RAJA_HOST_DEVICE RAJA_INLINE void +loop_icount(CONTEXT const& ctx, SEGMENT const& segment, BODY const& body) { - LoopICountExecute, SEGMENT>::exec(ctx, - segment, - body); + LoopICountExecute, SEGMENT>::exec( + ctx, segment, body); } namespace expt @@ -573,16 +676,14 @@ template -RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx, - SEGMENT const &segment0, - SEGMENT const &segment1, - BODY const &body) +RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const& ctx, + SEGMENT const& segment0, + SEGMENT const& segment1, + BODY const& body) { - LoopExecute, SEGMENT>::exec(ctx, - segment0, - segment1, - body); + LoopExecute, SEGMENT>::exec( + ctx, segment0, segment1, body); } RAJA_SUPPRESS_HD_WARN @@ -590,18 +691,18 @@ template -RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx, - SEGMENT const &segment0, - SEGMENT const &segment1, - SEGMENT const &segment2, - BODY const &body) +RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const& ctx, + SEGMENT const& segment0, + SEGMENT const& segment1, + SEGMENT const& segment2, + BODY const& body) { - LoopICountExecute, SEGMENT>::exec(ctx, - segment0, segment1, segment2, body); + LoopICountExecute, SEGMENT>::exec( + ctx, segment0, segment1, segment2, body); } -} //namespace expt +} // 
namespace expt template struct TileExecute; @@ -614,16 +715,14 @@ template -RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx, +RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const& ctx, TILE_T tile_size, - SEGMENT const &segment, - BODY const &body) + SEGMENT const& segment, + BODY const& body) { - TileExecute, SEGMENT>::exec(ctx, - tile_size, - segment, - body); + TileExecute, SEGMENT>::exec( + ctx, tile_size, segment, body); } template -RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const &ctx, - TILE_T tile_size, - SEGMENT const &segment, - BODY const &body) +RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const& ctx, + TILE_T tile_size, + SEGMENT const& segment, + BODY const& body) { - TileTCountExecute, SEGMENT>::exec(ctx, - tile_size, - segment, - body); + TileTCountExecute, SEGMENT>::exec( + ctx, tile_size, segment, body); } namespace expt @@ -650,20 +747,16 @@ template -RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx, +RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const& ctx, TILE_T tile_size0, TILE_T tile_size1, - SEGMENT const &segment0, - SEGMENT const &segment1, - BODY const &body) + SEGMENT const& segment0, + SEGMENT const& segment1, + BODY const& body) { - TileExecute, SEGMENT>::exec(ctx, - tile_size0, - tile_size1, - segment0, - segment1, - body); + TileExecute, SEGMENT>::exec( + ctx, tile_size0, tile_size1, segment0, segment1, body); } template -RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const &ctx, - TILE_T tile_size0, - TILE_T tile_size1, - SEGMENT const &segment0, - SEGMENT const &segment1, - BODY const &body) +RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const& ctx, + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const& segment0, + SEGMENT const& segment1, + BODY const& body) { - TileTCountExecute, SEGMENT>::exec(ctx, - tile_size0, - tile_size1, - segment0, - segment1, - body); + TileTCountExecute, SEGMENT>::exec( + ctx, tile_size0, tile_size1, segment0, segment1, body); } -} //namespace expt +} // namespace expt -} // namespace RAJA +} // namespace RAJA #endif diff --git a/include/RAJA/pattern/multi_reduce.hpp b/include/RAJA/pattern/multi_reduce.hpp index 3fbe36877c..9d3d9dc975 100644 --- a/include/RAJA/pattern/multi_reduce.hpp +++ b/include/RAJA/pattern/multi_reduce.hpp @@ -156,7 +156,7 @@ struct MultiReduceSum; */ template struct MultiReduceBitOr; - + /*! 
****************************************************************************** @@ -171,7 +171,8 @@ struct MultiReduceBitOr; Index_ptr bins = ...; Real_ptr bit_vals = ...; - MultiReduceBitAnd my_bits(num_bins, init_val); + MultiReduceBitAnd my_bits(num_bins, + init_val); forall( ..., [=] (Index_type i) { my_bits[bins[i]] &= (data[i]); @@ -188,7 +189,7 @@ struct MultiReduceBitOr; template struct MultiReduceBitAnd; -} //namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/params/forall.hpp b/include/RAJA/pattern/params/forall.hpp index fb854c8706..b1fdfa3b59 100644 --- a/include/RAJA/pattern/params/forall.hpp +++ b/include/RAJA/pattern/params/forall.hpp @@ -21,346 +21,440 @@ namespace RAJA namespace expt { - // - // - // Forall Parameter Packing type - // - // - struct ParamMultiplexer; - - template - struct ForallParamPack { - - friend struct ParamMultiplexer; - - using Base = camp::tuple; - Base param_tup; - - static constexpr size_t param_tup_sz = camp::tuple_size::value; - using params_seq = camp::make_idx_seq_t< param_tup_sz >; - - private: - - // Init - template - static constexpr void detail_init(EXEC_POL, camp::idx_seq, ForallParamPack& f_params, Args&& ...args) { - CAMP_EXPAND(expt::detail::init( camp::get(f_params.param_tup), std::forward(args)... )); - } - - // Combine - template - RAJA_HOST_DEVICE - static constexpr void detail_combine(EXEC_POL, camp::idx_seq, ForallParamPack& out, const ForallParamPack& in ) { - CAMP_EXPAND(detail::combine( camp::get(out.param_tup), camp::get(in.param_tup))); - } - - template - RAJA_HOST_DEVICE - static constexpr void detail_combine(EXEC_POL, camp::idx_seq, ForallParamPack& f_params ) { - CAMP_EXPAND(detail::combine( camp::get(f_params.param_tup) )); - } - - // Resolve - template - static constexpr void detail_resolve(EXEC_POL, camp::idx_seq, ForallParamPack& f_params, Args&& ...args) { - CAMP_EXPAND(detail::resolve( camp::get(f_params.param_tup), std::forward(args)... )); - } - - // Used to construct the argument TYPES that will be invoked with the lambda. - template - static constexpr auto LAMBDA_ARG_TUP_T() { return camp::tuple<>{}; }; - template - static constexpr auto LAMBDA_ARG_TUP_T() { return typename First::ARG_TUP_T(); }; - template - static constexpr auto LAMBDA_ARG_TUP_T() { return camp::tuple_cat_pair(typename First::ARG_TUP_T(), LAMBDA_ARG_TUP_T()); }; - - using lambda_arg_tuple_t = decltype(LAMBDA_ARG_TUP_T()); - - //Use the size of param_tup to generate the argument list. - RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<0>) { return camp::make_tuple(); } - RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<1>) { return camp::get(param_tup).get_lambda_arg_tup(); } - template - RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num) { - return camp::tuple_cat_pair( camp::get(param_tup).get_lambda_arg_tup(), LAMBDA_ARG_TUP_V(camp::num()) ); - } - - public: - ForallParamPack(){} - - RAJA_HOST_DEVICE constexpr lambda_arg_tuple_t lambda_args() {return LAMBDA_ARG_TUP_V(camp::num());} - - using lambda_arg_seq = camp::make_idx_seq_t::value>; - - template - ForallParamPack(camp::tuple&& t) : param_tup(std::move(t)) {}; - }; // struct ForallParamPack - - - - //=========================================================================== - // - // - // ParamMultiplexer is how we hook into the individual calls within forall_impl. 
- // - // - struct ParamMultiplexer { - template> - static void constexpr init( ForallParamPack& f_params, Args&& ...args) { - FP::detail_init(EXEC_POL(),typename FP::params_seq(), f_params, std::forward(args)... ); - } - template> - static void constexpr combine(ForallParamPack& f_params, Args&& ...args){ - FP::detail_combine(EXEC_POL(), typename FP::params_seq(), f_params, std::forward(args)... ); - } - template> - static void constexpr resolve( ForallParamPack& f_params, Args&& ...args){ - FP::detail_resolve(EXEC_POL(), typename FP::params_seq(), f_params, std::forward(args)... ); - } - }; - //=========================================================================== +// +// +// Forall Parameter Packing type +// +// +struct ParamMultiplexer; + +template +struct ForallParamPack +{ + + friend struct ParamMultiplexer; + using Base = camp::tuple; + Base param_tup; + static constexpr size_t param_tup_sz = camp::tuple_size::value; + using params_seq = camp::make_idx_seq_t; + +private: + // Init + template + static constexpr void detail_init(EXEC_POL, + camp::idx_seq, + ForallParamPack& f_params, + Args&&... args) + { + CAMP_EXPAND(expt::detail::init(camp::get(f_params.param_tup), + std::forward(args)...)); + } + + // Combine + template + RAJA_HOST_DEVICE static constexpr void + detail_combine(EXEC_POL, + camp::idx_seq, + ForallParamPack& out, + const ForallParamPack& in) + { + CAMP_EXPAND(detail::combine(camp::get(out.param_tup), + camp::get(in.param_tup))); + } - //=========================================================================== - // - // - // ForallParamPack generators. - // - // - RAJA_INLINE static auto get_empty_forall_param_pack(){ - static ForallParamPack<> p; - return p; + template + RAJA_HOST_DEVICE static constexpr void + detail_combine(EXEC_POL, camp::idx_seq, ForallParamPack& f_params) + { + CAMP_EXPAND(detail::combine(camp::get(f_params.param_tup))); } - namespace detail { - // all_true trick to perform variadic expansion in static asserts. - // https://stackoverflow.com/questions/36933176/how-do-you-static-assert-the-values-in-a-parameter-pack-of-a-variadic-template - template struct bool_pack; - template - using all_true = std::is_same, bool_pack>; + // Resolve + template + static constexpr void detail_resolve(EXEC_POL, + camp::idx_seq, + ForallParamPack& f_params, + Args&&... args) + { + CAMP_EXPAND(detail::resolve(camp::get(f_params.param_tup), + std::forward(args)...)); + } - template - using check_types_derive_base = all_true::value...>; - } // namespace detail + // Used to construct the argument TYPES that will be invoked with the lambda. + template + static constexpr auto LAMBDA_ARG_TUP_T() + { + return camp::tuple<>{}; + }; + template + static constexpr auto LAMBDA_ARG_TUP_T() + { + return typename First::ARG_TUP_T(); + }; + template + static constexpr auto LAMBDA_ARG_TUP_T() + { + return camp::tuple_cat_pair(typename First::ARG_TUP_T(), + LAMBDA_ARG_TUP_T()); + }; + using lambda_arg_tuple_t = decltype(LAMBDA_ARG_TUP_T()); - template - constexpr auto make_forall_param_pack_from_tuple(camp::tuple&& tuple) { - static_assert(detail::check_types_derive_base...>::value, - "Forall optional arguments do not derive ForallParamBase. Please see Reducer, ReducerLoc and KernelName for examples.") ; - return ForallParamPack...>(std::move(tuple)); + // Use the size of param_tup to generate the argument list. 
+ RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<0>) + { + return camp::make_tuple(); + } + RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<1>) + { + return camp::get(param_tup).get_lambda_arg_tup(); + } + template + RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num) + { + return camp::tuple_cat_pair( + camp::get(param_tup).get_lambda_arg_tup(), + LAMBDA_ARG_TUP_V(camp::num())); } - +public: + ForallParamPack() {} - namespace detail { - // Maybe we should do a lot of these with structs... - template - constexpr auto tuple_from_seq (const camp::idx_seq&, TupleType&& tuple){ - return camp::forward_as_tuple( camp::get< Seq >(std::forward(tuple))... ); - }; + RAJA_HOST_DEVICE constexpr lambda_arg_tuple_t lambda_args() + { + return LAMBDA_ARG_TUP_V(camp::num()); + } - template - constexpr auto strip_last_elem(camp::tuple&& tuple){ - return tuple_from_seq(camp::make_idx_seq_t{},std::move(tuple)); - }; - } // namespace detail + using lambda_arg_seq = + camp::make_idx_seq_t::value>; + template + ForallParamPack(camp::tuple&& t) : param_tup(std::move(t)){}; +}; // struct ForallParamPack - // Make a tuple of the param pack except the final element... - template - constexpr auto make_forall_param_pack(Args&&... args){ - // We assume the last element of the pack is the lambda so we need to strip it from the list. - auto stripped_arg_tuple = detail::strip_last_elem( camp::forward_as_tuple(std::forward(args)...) ); - return make_forall_param_pack_from_tuple(std::move(stripped_arg_tuple)); + +//=========================================================================== +// +// +// ParamMultiplexer is how we hook into the individual calls within forall_impl. +// +// +struct ParamMultiplexer +{ + template > + static void constexpr init(ForallParamPack& f_params, + Args&&... args) + { + FP::detail_init(EXEC_POL(), + typename FP::params_seq(), + f_params, + std::forward(args)...); + } + template > + static void constexpr combine(ForallParamPack& f_params, + Args&&... args) + { + FP::detail_combine(EXEC_POL(), + typename FP::params_seq(), + f_params, + std::forward(args)...); + } + template > + static void constexpr resolve(ForallParamPack& f_params, + Args&&... args) + { + FP::detail_resolve(EXEC_POL(), + typename FP::params_seq(), + f_params, + std::forward(args)...); } - //=========================================================================== - - - - //=========================================================================== - // - // - // Callable should be the last argument in the param pack, just extract it... - // - // - template - constexpr auto&& get_lambda(Args&&... args){ - return camp::get( camp::forward_as_tuple(std::forward(args)...) ); - } - //=========================================================================== - - - - //=========================================================================== - // - // - // Checking expected argument list against the assumed lambda. 
- // - // - namespace detail { - - // - // - // Lambda traits Utilities - // - // - template - struct lambda_traits; - - template - struct lambda_traits - { // non-const specialization - using arg_type = First; - }; - template - struct lambda_traits - { // const specialization - using arg_type = First; - }; - - template - typename lambda_traits::arg_type* lambda_arg_helper(T); - - - // - // - // List manipulation Utilities - // - // - template - constexpr auto list_remove_pointer(const camp::list&){ - return camp::list::type>...>{}; - } - - template - constexpr auto list_add_lvalue_ref(const camp::list&){ - return camp::list::type...>{}; - } - - template - constexpr auto tuple_to_list(const camp::tuple&) { - return camp::list{}; - } - - // TODO : Change to std::is_invocable at c++17 - template - struct is_invocable : - std::is_constructible< - std::function, - std::reference_wrapper::type> - >{}; - - template - using void_t = void; - - template - struct has_empty_op : std::false_type{}; - - template - struct has_empty_op)>> : std::true_type{}; - - template - struct get_lambda_index_type { - typedef typename std::remove_pointer< - decltype(lambda_arg_helper( - &camp::decay::operator()) - ) - >::type type; - }; - - // If LAMBDA::operator() is not available this probably isn't a generic lambda and we can't extract and check args. - template - constexpr concepts::enable_if>> check_invocable(LAMBDA&&, const camp::list&) {} - - template - constexpr concepts::enable_if> check_invocable(LAMBDA&&, const camp::list&) { +}; +//=========================================================================== + + +//=========================================================================== +// +// +// ForallParamPack generators. +// +// +RAJA_INLINE static auto get_empty_forall_param_pack() +{ + static ForallParamPack<> p; + return p; +} + +namespace detail +{ +// all_true trick to perform variadic expansion in static asserts. +// https://stackoverflow.com/questions/36933176/how-do-you-static-assert-the-values-in-a-parameter-pack-of-a-variadic-template +template +struct bool_pack; +template +using all_true = std::is_same, bool_pack>; + +template +using check_types_derive_base = + all_true::value...>; +} // namespace detail + + +template +constexpr auto make_forall_param_pack_from_tuple(camp::tuple&& tuple) +{ + static_assert(detail::check_types_derive_base...>::value, + "Forall optional arguments do not derive ForallParamBase. " + "Please see Reducer, ReducerLoc and KernelName for examples."); + return ForallParamPack...>(std::move(tuple)); +} + + +namespace detail +{ +// Maybe we should do a lot of these with structs... +template +constexpr auto tuple_from_seq(const camp::idx_seq&, TupleType&& tuple) +{ + return camp::forward_as_tuple( + camp::get(std::forward(tuple))...); +}; + +template +constexpr auto strip_last_elem(camp::tuple&& tuple) +{ + return tuple_from_seq(camp::make_idx_seq_t{}, + std::move(tuple)); +}; +} // namespace detail + + +// Make a tuple of the param pack except the final element... +template +constexpr auto make_forall_param_pack(Args&&... args) +{ + // We assume the last element of the pack is the lambda so we need to strip it + // from the list. 
+ auto stripped_arg_tuple = detail::strip_last_elem( + camp::forward_as_tuple(std::forward(args)...)); + return make_forall_param_pack_from_tuple(std::move(stripped_arg_tuple)); +} +//=========================================================================== + + +//=========================================================================== +// +// +// Callable should be the last argument in the param pack, just extract it... +// +// +template +constexpr auto&& get_lambda(Args&&... args) +{ + return camp::get( + camp::forward_as_tuple(std::forward(args)...)); +} +//=========================================================================== + + +//=========================================================================== +// +// +// Checking expected argument list against the assumed lambda. +// +// +namespace detail +{ + +// +// +// Lambda traits Utilities +// +// +template +struct lambda_traits; + +template +struct lambda_traits +{ // non-const specialization + using arg_type = First; +}; +template +struct lambda_traits +{ // const specialization + using arg_type = First; +}; + +template +typename lambda_traits::arg_type* lambda_arg_helper(T); + + +// +// +// List manipulation Utilities +// +// +template +constexpr auto list_remove_pointer(const camp::list&) +{ + return camp::list::type>...>{}; +} + +template +constexpr auto list_add_lvalue_ref(const camp::list&) +{ + return camp::list::type...>{}; +} + +template +constexpr auto tuple_to_list(const camp::tuple&) +{ + return camp::list{}; +} + +// TODO : Change to std::is_invocable at c++17 +template +struct is_invocable + : std::is_constructible< + std::function, + std::reference_wrapper::type>> +{}; + +template +using void_t = void; + +template +struct has_empty_op : std::false_type +{}; + +template +struct has_empty_op)>> + : std::true_type +{}; + +template +struct get_lambda_index_type +{ + typedef typename std::remove_pointer::operator()))>::type type; +}; + +// If LAMBDA::operator() is not available this probably isn't a generic lambda +// and we can't extract and check args. +template +constexpr concepts::enable_if>> +check_invocable(LAMBDA&&, const camp::list&) +{} + +template +constexpr concepts::enable_if> +check_invocable(LAMBDA&&, const camp::list&) +{ #if !defined(RAJA_ENABLE_HIP) - static_assert(is_invocable::type, EXPECTED_ARGS...>::value, "LAMBDA Not invocable w/ EXPECTED_ARGS."); + static_assert(is_invocable::type, + EXPECTED_ARGS...>::value, + "LAMBDA Not invocable w/ EXPECTED_ARGS."); #endif - } - - } // namespace detail +} +} // namespace detail - template - constexpr - void - check_forall_optional_args(Lambda&& l, ForallParams& fpp) { - using expected_arg_type_list = decltype( detail::list_add_lvalue_ref( - detail::list_remove_pointer( - detail::tuple_to_list( - fpp.lambda_args() - ) - ) - )); +template +constexpr void check_forall_optional_args(Lambda&& l, ForallParams& fpp) +{ - detail::check_invocable(std::forward(l), expected_arg_type_list{}); - } - //=========================================================================== - + using expected_arg_type_list = decltype(detail::list_add_lvalue_ref( + detail::list_remove_pointer(detail::tuple_to_list(fpp.lambda_args())))); + detail::check_invocable(std::forward(l), expected_arg_type_list{}); +} +//=========================================================================== - //=========================================================================== - // - // - // Type trailts for SFINAE work. 
- // - // - namespace type_traits - { - template struct is_ForallParamPack : std::false_type {}; - template struct is_ForallParamPack> : std::true_type {}; - template struct is_ForallParamPack_empty : std::true_type {}; - template struct is_ForallParamPack_empty> : std::false_type {}; - template <> struct is_ForallParamPack_empty> : std::true_type {}; - } - //=========================================================================== - - - - //=========================================================================== - // - // - // Invoke Forall with Params. - // - // - namespace detail { - template - RAJA_HOST_DEVICE - constexpr - auto get_lambda_args(FP& fpp) - -> decltype( *camp::get( fpp.lambda_args() ) ) { - return ( *camp::get( fpp.lambda_args() ) ); - } - - CAMP_SUPPRESS_HD_WARN - template - RAJA_HOST_DEVICE constexpr auto invoke_with_order(Params&& params, - Fn&& f, - camp::idx_seq, - Ts&&... extra) - { - return f(std::forward(extra...), ( get_lambda_args(params) )...); - } - } // namespace detail - - //CAMP_SUPPRESS_HD_WARN - template - RAJA_HOST_DEVICE constexpr auto invoke_body(Params&& params, Fn&& f, Ts&&... extra) - { - return detail::invoke_with_order( - camp::forward(params), - camp::forward(f), - typename camp::decay::lambda_arg_seq(), - camp::forward(extra)...); - } - //=========================================================================== +//=========================================================================== +// +// +// Type trailts for SFINAE work. +// +// +namespace type_traits +{ +template +struct is_ForallParamPack : std::false_type +{}; +template +struct is_ForallParamPack> : std::true_type +{}; + +template +struct is_ForallParamPack_empty : std::true_type +{}; +template +struct is_ForallParamPack_empty> + : std::false_type +{}; +template <> +struct is_ForallParamPack_empty> : std::true_type +{}; +} // namespace type_traits +//=========================================================================== + + +//=========================================================================== +// +// +// Invoke Forall with Params. +// +// +namespace detail +{ +template +RAJA_HOST_DEVICE constexpr auto get_lambda_args(FP& fpp) + -> decltype(*camp::get(fpp.lambda_args())) +{ + return (*camp::get(fpp.lambda_args())); +} + +CAMP_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE constexpr auto invoke_with_order(Params&& params, + Fn&& f, + camp::idx_seq, + Ts&&... extra) +{ + return f(std::forward(extra...), + (get_lambda_args(params))...); +} +} // namespace detail + +// CAMP_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE constexpr auto +invoke_body(Params&& params, Fn&& f, Ts&&... 
extra) +{ + return detail::invoke_with_order( + camp::forward(params), + camp::forward(f), + typename camp::decay::lambda_arg_seq(), + camp::forward(extra)...); +} +//=========================================================================== } // namespace expt } // namespace RAJA diff --git a/include/RAJA/pattern/params/kernel_name.hpp b/include/RAJA/pattern/params/kernel_name.hpp index e768d8dd59..f3a517fbac 100644 --- a/include/RAJA/pattern/params/kernel_name.hpp +++ b/include/RAJA/pattern/params/kernel_name.hpp @@ -10,23 +10,20 @@ namespace expt namespace detail { - struct KernelName : public ForallParamBase { - RAJA_HOST_DEVICE KernelName() {} - KernelName(const char* name_in) : name(name_in) {} - const char* name; - }; +struct KernelName : public ForallParamBase +{ + RAJA_HOST_DEVICE KernelName() {} + KernelName(const char* name_in) : name(name_in) {} + const char* name; +}; } // namespace detail -inline auto KernelName(const char * n) -{ - return detail::KernelName(n); -} +inline auto KernelName(const char* n) { return detail::KernelName(n); } } // namespace expt } // namespace RAJA - #endif // KERNEL_NAME_HPP diff --git a/include/RAJA/pattern/params/params_base.hpp b/include/RAJA/pattern/params/params_base.hpp index 51e96260f8..78b14f907a 100644 --- a/include/RAJA/pattern/params/params_base.hpp +++ b/include/RAJA/pattern/params/params_base.hpp @@ -9,16 +9,17 @@ namespace expt namespace detail { - struct ForallParamBase { - - // Some of this can be made virtual in c++20, for now must be defined in each child class - // if any arguments to the forall lambda are needed (e.g. KernelName is excluded.) - using ARG_TUP_T = camp::tuple<>; - using ARG_LIST_T = typename ARG_TUP_T::TList; - RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(); } - static constexpr size_t num_lambda_args = camp::tuple_size::value; - - }; +struct ForallParamBase +{ + + // Some of this can be made virtual in c++20, for now must be defined in each + // child class if any arguments to the forall lambda are needed (e.g. + // KernelName is excluded.) 
+ using ARG_TUP_T = camp::tuple<>; + using ARG_LIST_T = typename ARG_TUP_T::TList; + RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(); } + static constexpr size_t num_lambda_args = camp::tuple_size::value; +}; } // namespace detail diff --git a/include/RAJA/pattern/params/reducer.hpp b/include/RAJA/pattern/params/reducer.hpp index 05103c7ad4..e6c4c737a1 100644 --- a/include/RAJA/pattern/params/reducer.hpp +++ b/include/RAJA/pattern/params/reducer.hpp @@ -18,8 +18,9 @@ namespace RAJA namespace expt { -template -struct ValLoc { +template +struct ValLoc +{ using index_type = RAJA::Index_type; using value_type = T; @@ -27,14 +28,28 @@ struct ValLoc { RAJA_HOST_DEVICE ValLoc(value_type v) : val(v) {} RAJA_HOST_DEVICE ValLoc(value_type v, RAJA::Index_type l) : val(v), loc(l) {} - RAJA_HOST_DEVICE void min(value_type v, index_type l) { if (v < val) { val = v; loc = l; } } - RAJA_HOST_DEVICE void max(value_type v, index_type l) { if (v > val) { val = v; loc = l; } } + RAJA_HOST_DEVICE void min(value_type v, index_type l) + { + if (v < val) + { + val = v; + loc = l; + } + } + RAJA_HOST_DEVICE void max(value_type v, index_type l) + { + if (v > val) + { + val = v; + loc = l; + } + } bool constexpr operator<(const ValLoc& rhs) const { return val < rhs.val; } bool constexpr operator>(const ValLoc& rhs) const { return val > rhs.val; } - value_type getVal() {return val;} - RAJA::Index_type getLoc() {return loc;} + value_type getVal() { return val; } + RAJA::Index_type getLoc() { return loc; } private: value_type val; @@ -47,7 +62,8 @@ namespace operators { template -struct limits> { +struct limits> +{ RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc min() { return RAJA::expt::ValLoc(RAJA::operators::limits::min()); @@ -71,75 +87,81 @@ namespace detail { #if defined(RAJA_CUDA_ACTIVE) - using device_mem_pool_t = RAJA::cuda::device_mempool_type; +using device_mem_pool_t = RAJA::cuda::device_mempool_type; #elif defined(RAJA_HIP_ACTIVE) - using device_mem_pool_t = RAJA::hip::device_mempool_type; +using device_mem_pool_t = RAJA::hip::device_mempool_type; #elif defined(RAJA_SYCL_ACTIVE) - using device_mem_pool_t = RAJA::sycl::device_mempool_type; +using device_mem_pool_t = RAJA::sycl::device_mempool_type; #endif - // - // - // Basic Reducer - // - // - template - struct Reducer : public ForallParamBase { - using op = Op; - using value_type = T; - - RAJA_HOST_DEVICE Reducer() {} - Reducer(value_type *target_in) : target(target_in), val(op::identity()) {} - - value_type *target = nullptr; - value_type val = op::identity(); - -#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) || defined(RAJA_SYCL_ACTIVE) - // Device related attributes. - value_type * devicetarget = nullptr; - RAJA::detail::SoAPtr device_mem; - unsigned int * device_count = nullptr; +// +// +// Basic Reducer +// +// +template +struct Reducer : public ForallParamBase +{ + using op = Op; + using value_type = T; + + RAJA_HOST_DEVICE Reducer() {} + Reducer(value_type* target_in) : target(target_in), val(op::identity()) {} + + value_type* target = nullptr; + value_type val = op::identity(); + +#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) || \ + defined(RAJA_SYCL_ACTIVE) + // Device related attributes. 
+ value_type* devicetarget = nullptr; + RAJA::detail::SoAPtr device_mem; + unsigned int* device_count = nullptr; #endif - using ARG_TUP_T = camp::tuple; - RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(&val); } + using ARG_TUP_T = camp::tuple; + RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() + { + return camp::make_tuple(&val); + } - using ARG_LIST_T = typename ARG_TUP_T::TList; - static constexpr size_t num_lambda_args = camp::tuple_size::value ; - }; + using ARG_LIST_T = typename ARG_TUP_T::TList; + static constexpr size_t num_lambda_args = camp::tuple_size::value; +}; } // namespace detail template